#!/usr/bin/ruby -w

# Exactly two arguments are required: the HHMM log and the SRILM n-gram log.
# Anything else prints the count received plus a usage line, then aborts.
argc = ARGV.length
unless argc == 2
	STDERR.puts "You used #{argc} arguments",
	            "",
	            "Usage: #{$0} hhmm-perplexity-log srilm-ngram-perplexity-log"
	exit(-1)
end


# Per-model sentence probabilities: prob[:hhmm][i] and prob[:ngram][i] hold the
# probability recorded for the i-th (0-based) matching line of each log.
# The default block creates the inner hash the first time a model key is used.
prob = Hash.new { |table, model| table[model] = Hash.new }

# Word and OOV counts per sentence, indexed the same way as prob[:hhmm].
word_count = Array.new
oov_count = Array.new

# Scan the HHMM log (first argument) line by line.
# File.foreach closes the file once iteration finishes, unlike a bare
# File.open(...).each_line which leaves the handle open.
hhmm_index=0
File.foreach(ARGV[0]) { |line|

	if line =~ /Prior probability of sentence:\s*(\S+)/
		prob[:hhmm][hhmm_index] = $1.to_f
		hhmm_index+=1
#	elsif line=~/Sentence has zero probability/
#		prob[:hhmm][hhmm_index] = 0.0
#		hhmm_index+=1 
	elsif line =~ /Number of words in sentence:\s*(\S+)/
		# hhmm_index has already been advanced, so -1 addresses the sentence
		# whose probability was recorded most recently.
		# NOTE(review): assumes the count lines follow their sentence's
		# probability line -- confirm against the HHMM log format.
		word_count[hhmm_index-1] = $1.to_i
	elsif line =~ /Number of OOV words in sentence:\s*(\S+)/
		oov_count[hhmm_index-1] = $1.to_i
	end
}

puts "Read #{hhmm_index} lines with probability info in HHMM file"


# Scan the SRILM log (second argument) line by line. Every line containing
# "zeroprobs, logprob=" contributes one entry; the captured value is treated
# as a base-10 log probability and converted back to a plain probability.
# File.foreach closes the file once iteration finishes, unlike a bare
# File.open(...).each_line which leaves the handle open.
ngram_index=0
File.foreach(ARGV[1]) { |line|

	if line =~ /zeroprobs, logprob=\s*(.*?)\s+/
		# String#to_f ignores surrounding whitespace, so no strip is needed.
		prob[:ngram][ngram_index] = 10**($1.to_f)
		ngram_index+=1
	end
}


# NOTE(review): the reported count is one less than the number of matches;
# presumably the last matching line is a file-level summary rather than a
# sentence -- confirm against the SRILM output format.
puts "Read #{ngram_index-1} lines with probability info in SRILM file"


# Sanity checks before combining the two logs. Any mismatch aborts with
# exit status -2.

# The SRILM side is compared as ngram_index-1 (one fewer than the number of
# matching lines). The message reports that same adjusted value so the two
# numbers shown are the ones that were actually compared.
if (ngram_index-1 != hhmm_index)
	STDERR.puts "Line counts differ: #{hhmm_index} versus #{ngram_index-1}"
	exit(-2)
end

# Each counter must agree with the number of entries actually stored.
if (hhmm_index != prob[:hhmm].length) 
	STDERR.puts "Line counts differ: #{hhmm_index} versus #{prob[:hhmm].length}"
	exit(-2)
end

if (ngram_index != prob[:ngram].length) 
	STDERR.puts "Line counts differ: #{ngram_index} versus #{prob[:ngram].length}"
	exit(-2)
end


# Share of the interpolated probability taken from the n-gram model;
# the HHMM model receives the remaining (1 - weight).
weight = 0.36

# Totals over "usable" sentences only, i.e. those to which both models
# assign a probability greater than zero.
usable_word_count = 0
usable_oov_count = 0
usable_logprob_sum = 0

# Natural-log probability totals for each model on the same usable sentences.
hhmm_logprob_sum = 0
ngram_logprob_sum = 0

prob[:hhmm].length.times do |i|

#	puts "Sentence #{i}	word_count=#{word_count[i]}	oov=#{oov_count[i]}	p_HHMM(sent #{i})=#{prob[:hhmm][i]}	p_ngram(sent #{i})=#{prob[:ngram][i]}"

	p_hhmm = prob[:hhmm][i]
	p_ngram = prob[:ngram][i]

	# A sentence only counts when both models give it non-zero probability.
	next unless p_hhmm > 0 && p_ngram > 0

	# Linear interpolation of the two sentence probabilities.
	mixture = weight*p_ngram + (1 - weight)*p_hhmm

	usable_logprob_sum += Math.log(mixture)
	usable_word_count  += word_count[i]
	usable_oov_count   += oov_count[i]

	hhmm_logprob_sum   += Math.log(p_hhmm)
	ngram_logprob_sum  += Math.log(p_ngram)
end

# Perplexity for a summed natural-log probability over token_count tokens:
# e ** (-logprob_sum / token_count).
# The sum is converted to Float first: when no sentence was usable every
# total is still the Integer 0, and Integer 0 / 0 raises ZeroDivisionError.
# With Float division that case yields NaN instead of aborting the report.
perplexity = lambda { |logprob_sum, token_count|
	Math::E**(-logprob_sum.to_f / token_count)
}

puts
puts

# One stanza per model. The first perplexity on each line is normalised by
# the usable words minus the OOV words, the second by all usable words.
[
	["N-gram",       ngram_logprob_sum],
	["HHMM",         hhmm_logprob_sum],
	["Interpolated", usable_logprob_sum]
].each { |label, logprob_sum|
	ppl        = perplexity.call(logprob_sum, usable_word_count - usable_oov_count)
	ppl_no_oov = perplexity.call(logprob_sum, usable_word_count)
	puts "#{label} logprob = #{logprob_sum}\twords = #{usable_word_count}"
	puts "#{label} perplexity = #{ppl}\t#{ppl_no_oov}"
	puts
}