###############################################################################
##                                                                           ##
## This file is part of ModelBlocks. Copyright 2009, ModelBlocks developers. ##
##                                                                           ##
##    ModelBlocks is free software: you can redistribute it and/or modify    ##
##    it under the terms of the GNU General Public License as published by   ##
##    the Free Software Foundation, either version 3 of the License, or      ##
##    (at your option) any later version.                                    ##
##                                                                           ##
##    ModelBlocks is distributed in the hope that it will be useful,         ##
##    but WITHOUT ANY WARRANTY; without even the implied warranty of         ##
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          ##
##    GNU General Public License for more details.                           ##
##                                                                           ##
##    You should have received a copy of the GNU General Public License      ##
##    along with ModelBlocks.  If not, see <http://www.gnu.org/licenses/>.   ##
##                                                                           ##
###############################################################################

#!/usr/bin/ruby

#####################################################################
# similarityvectors.rb
# 
# generates a similarity vector from a hhmmparser-hdwd output. 
#
# TO RUN: 
#  cat similarityparser.out | ruby scripts/similarityvectors.rb scratch/genmodel/LMod.wsjnphd.model
#
######################################################################

##### utilities
class NestedHash < Hash

  # Builds a hash whose missing keys spring into existence on first read.
  #
  # Reading an absent key stores (and returns) a new plain Hash that carries
  # the very same default proc, so chained accesses such as
  #   h[a][b][c] = x
  # create every intermediate level on demand.  Note that the inner levels
  # are ordinary Hash objects, not NestedHash instances.
  def initialize
    vivify = lambda do |hash, key|
      hash[key] = Hash.new(&vivify)
    end
    super(&vivify)
  end

end


#require 'narray'
class Array

  # Dot product of self and v.
  #
  # CAUTION: this replaces the built-in Array#* (repetition / join) for every
  # Array in the process.
  #
  # When the operands differ in length, the shorter one is treated as if it
  # had trailing zeros.  Neither operand is modified: the missing positions
  # are read as 0 on the fly instead of being written into the arrays, so
  # frozen or shared arrays can be passed safely.
  #
  # v       -- any object responding to #length and #[] with integer indices
  # returns -- the sum of self[i] * v[i] over the longer length (0 when both
  #            operands are empty)
  def * (v)
    own_len = length
    other_len = v.length
    width = own_len > other_len ? own_len : other_len

    # accumulate left-to-right starting from integer 0
    sum = 0
    width.times do |i|
      a = i < own_len ? self[i] : 0
      b = i < other_len ? v[i] : 0
      sum += a * b
    end
    sum
  end

end


# Similarity score of two vectors: the result of v1 * v2.
# For Arrays this dispatches to the Array#* defined in this file, which
# computes a dot product.
# NOTE(review): despite the name, nothing here divides by the vectors'
# magnitudes -- confirm whether callers are expected to pass normalized
# vectors or whether a plain dot product is intended.
def cossimilarity(v1,v2)
  v1 * v2
end



##### model structures
class LMod

  # Two-level table: model[src][targ] = probability (Float).
  attr_accessor :model

  # Builds the table from model-file lines.
  #
  # Only lines containing the text 'LM ' are used.  Such a line is split on
  # whitespace; token 1 becomes the outer key, token 3 the inner key and
  # token 5 (converted with to_f) the stored value.  Tokens 0, 2 and 4 are
  # ignored.
  # (presumably the line format is "LM <src> : <targ> = <prob>", matching
  # what to_s prints -- verify against the model file)
  def initialize(lines={})
    @model = NestedHash.new
    lines.each do |line|
      next unless line.include?('LM ')

      fields = line.split(' ')
      @model[fields[1]][fields[3]] = fields[5].to_f
    end
  end

  # Prints one "src : targ = prob" line per entry to standard output.
  # NOTE: unlike a conventional to_s this writes the text itself and returns
  # the model hash rather than a String.
  def to_s
    @model.each do |src, targhash|
      targhash.each do |targ, prob|
        print src + " : " + targ + " = " + prob.to_s + "\n"
      end
    end
  end

end
# Load the language model named by the first command-line argument into the
# global $lmod (HashVector reads $lmod.model when a vector is constructed).
# A missing argument is reported with a usage message instead of an opaque
# TypeError, and File.readlines is used so the argument is always treated as
# a file path (IO.readlines would run a command if it began with '|').
abort "usage: cat <parser output> | ruby #{$0} <LMod model file>" if ARGV[0].nil?
lmodlines = File.readlines(ARGV[0])
$lmod = LMod.new(lmodlines)
# $lmod.to_s

# One hypothesis read from the parser output: a verb, its subject, a scale
# factor derived from a log-probability-like score, and the verb's row of
# the global language model.
class HashVector

  attr_accessor :data    # $lmod.model[verb] -- hash for this verb
  attr_accessor :scaler  # Math.exp(score / 100.0)
  attr_accessor :verb
  attr_accessor :subj

  # verb   -- key looked up in $lmod.model (the global must be set before
  #           constructing a HashVector)
  # subj   -- stored as-is
  # scaler -- the raw score, given either as a bare number/string or as an
  #           array whose first element is the score (the form produced by
  #           String#scan with a capture group).  The default of 1.0
  #           previously raised NoMethodError because Float has no #[];
  #           it now yields Math.exp(0.01).
  #
  # Raises ArgumentError when scaler is nil.
  def initialize(verb="",subj="",scaler=1.0)
    raise ArgumentError, 'scaler must not be nil' if scaler.nil?

    @verb = verb
    @subj = subj
    # Array() wraps a bare scalar and leaves an array untouched, so both
    # input shapes reduce to "first element, as a float".
    score = Array(scaler).first.to_f
    @scaler = Math.exp(score / 100.0)
    # NOTE: if $lmod.model is a NestedHash, looking up an unknown verb adds
    # an empty entry for it to the model.
    @data = $lmod.model[@verb]
  end

end

class Hash

  ## sort and return a hash
#  def sort_hash
#    hash = Hash.new()
#    array = self.sort
#    array.each { |kv|
#      hash[kv[0]] = kv[1]
#    }
#    return hash
#  end

  ## Values of this hash as an array, ordered by sorted key.
  def to_array()
    sort.map { |pair| pair[1] }
  end

  ## Like to_array, but over the union of this hash's keys and refv's keys:
  ## a key present only in refv contributes 0, a key present in self
  ## contributes self's value.  Calling a.to_array2(b) and b.to_array2(a)
  ## therefore gives two equally long arrays laid out in the same key order.
  ## Neither hash is modified.
  def to_array2(refv)
    # start from the union of both key sets, keeping our own value on clashes
    padded = merge(refv) { |_key, own, _theirs| own }
    # keys that came only from refv get a zero instead of refv's value
    refv.each do |k, _v|
      padded[k] = 0 unless key?(k)
    end
    padded.sort.map { |pair| pair[1] }
  end

end

#g = {"j"=>10,"a"=>3,"c"=>6}
#h = {"k"=>10,"a"=>15,"c"=>6}
#print g.to_array()
#print "\n"
#print g.to_array2(h)
#print "\n"

#class VectorsToCompare
# 
#  attr.accessor :vectors
# 
#  def initialize(vectors={})
#    @vectors = Hash.new()
#  end
# 
#end




##### main

# vectorsToCompare[subj][verb] = HashVector for that hypothesis.
vectorsToCompare = NestedHash.new() #VectorsToCompare.new()

# Whitespace-split tokens of the three most recently read lines (oldest
# first) plus the current line.  They start out as empty arrays so that
# indexing them before enough lines have been read simply yields nil.
prevprevprevparts = []
prevprevparts = []
prevparts = []
parts = []

##### turn hypoths into vectors, stored in a hash
# A line containing '------' closes a hypothesis, provided the line before it
# has a first token and that token contains no '-'.  The nil check matters
# when the '------' line is the first line of input or follows a blank line:
# there is then no previous token, and the line is skipped instead of
# raising NoMethodError on nil.
#
# For a closing line the fields are taken from the preceding lines:
#   verb  <- 4th token of the line two back
#   subj  <- 4th token of the line three back
#   score <- 3rd token of the previous line, split on ';'; the first run of
#            digits/hyphens in its second piece (as returned by scan, i.e. a
#            one-element array)
# (assumes the hhmmparser-hdwd output keeps this layout -- TODO confirm)
# A later hypothesis with the same subj/verb pair replaces the earlier one.
while (line = STDIN.gets)
  parts = line.split(' ')

  prevfirst = prevparts[0]
  if line.include?('------') && prevfirst && !prevfirst.include?('-')
    prevprevprevword = prevprevprevparts[3]
    prevprevword = prevprevparts[3]
    #print prevprevword
    prevrstate = prevparts[2].split(';')
    logprob = prevrstate[1].scan(/([\-0-9]+)/)[0]
    #print logprob
    v = HashVector.new(prevprevword,prevprevprevword,logprob)

    vectorsToCompare[v.subj][v.verb] = v
    #print v.verb+" "+v.scaler.to_s+"\n"
  end

  # slide the three-line history window forward
  prevprevprevparts = prevprevparts
  prevprevparts = prevparts
  prevparts = parts
end

#vectorsToCompare.each { |d1,d2|
#  d2.each { |d2,d3|
#    print "  NN:"+d1+" VBD:"+d2+" v.scaler="+d3.scaler.to_s+"\n"
#  }
#}



##### do comparisons on vectors of the same subject (tournament-style)

# results[subj][verb1][verb2] = similarity of verb1's and verb2's vectors.
results = NestedHash.new

# Every ordered pair of distinct vectors sharing a subject is compared, so
# each unordered pair shows up twice (once per direction).
# you'll have to pick the right ones for non-symmetric measures.
vectorsToCompare.each do |subj, verb2vect|
  verb2vect.each_value do |vector1|
    verb2vect.each_value do |vector2|
      # never compare a vector with itself
      next if vector1 == vector2

      # expand both model rows over their combined key set so that the two
      # arrays line up position by position
      left = vector1.data.to_array2(vector2.data)
      right = vector2.data.to_array2(vector1.data)
      results[subj][vector1.verb][vector2.verb] = cossimilarity(left, right)
    end
  end
end
   


# Print one line per stored comparison: subject, first verb, second verb and
# their similarity score.
results.each do |subj, by_first_verb|
  by_first_verb.each do |verb1, by_second_verb|
    by_second_verb.each do |verb2, sim|
      print "  NN:" + subj + " VBD1:" + verb1 + " VBD2:" + verb2 + " sim=" + sim.to_s + "\n"
    end
  end
end
