#!/usr/bin/ruby -w

# Exactly three log files are required: the HHMM log followed by the two
# SRILM n-gram logs. Anything else prints the usage line and aborts.
unless ARGV.length == 3
	STDERR.puts "Usage: #{$0} hhmm-perplexity-log first-srilm-ngram-perplexity-log second-srilm-ngram-perplexity-log"
	exit(-1)
end


# Sentence probabilities per model: prob[model][sentence_index] => probability.
# The inner hash for a model is created the first time that model key is used.
prob = Hash.new { |table, model| table[model] = {} }

# Per-sentence word and OOV counts, indexed by sentence number.
word_count = []
oov_count = []


# Parse the HHMM perplexity log (ARGV[0]).
# A "Prior probability of sentence" or "Sentence has zero probability" line
# starts a new entry in prob[:hhmm]; word and OOV counts are stored under the
# most recently started sentence (hhmm_index-1).
# NOTE(review): this assumes the count lines come after the probability line
# of the same sentence -- confirm against the HHMM log format.
hhmm_index=0
# File.foreach closes the file once iteration ends; a bare File.open(...).each_line
# leaves the handle open until garbage collection.
File.foreach(ARGV[0]) { |line|

	case line
	when /Prior probability of sentence:\s*(\S+)/
		prob[:hhmm][hhmm_index] = $1.to_f
		hhmm_index+=1
	when /Sentence has zero probability/
		prob[:hhmm][hhmm_index] = 0.0
		hhmm_index+=1
	when /Number of words in sentence:\s*(\S+)/
		word_count[hhmm_index-1] = $1.to_i
	when /Number of OOV words in sentence:\s*(\S+)/
		oov_count[hhmm_index-1] = $1.to_i
	end
}

puts "Read #{hhmm_index} lines with probability info in HHMM file"


# Parse the first SRILM n-gram log (ARGV[1]).
# Every line containing "zeroprobs, logprob=" contributes one entry; the
# logged value is a base-10 log probability, converted here to a plain probability.
ngram1_index=0
# File.foreach closes the file once iteration ends; a bare File.open(...).each_line
# leaves the handle open until garbage collection.
File.foreach(ARGV[1]) { |line|

	if line =~ /zeroprobs, logprob=\s*(.*?)\s+/
		prob[:firstngram][ngram1_index] = 10**($1.strip.to_f)
		ngram1_index+=1
	end
}

# NOTE(review): one is subtracted presumably because the last matching line is
# SRILM's corpus-level summary rather than a sentence -- confirm.
puts "Read #{ngram1_index-1} lines with probability info in first SRILM file"


# Parse the second SRILM n-gram log (ARGV[2]).
# Every line containing "zeroprobs, logprob=" contributes one entry; the
# logged value is a base-10 log probability, converted here to a plain probability.
ngram2_index=0
# File.foreach closes the file once iteration ends; a bare File.open(...).each_line
# leaves the handle open until garbage collection.
File.foreach(ARGV[2]) { |line|

	if line =~ /zeroprobs, logprob=\s*(.*?)\s+/
		prob[:secondngram][ngram2_index] = 10**($1.strip.to_f)
		ngram2_index+=1
	end
}


# NOTE(review): one is subtracted presumably because the last matching line is
# SRILM's corpus-level summary rather than a sentence -- confirm.
puts "Read #{ngram2_index-1} lines with probability info in SRILM file"


# Sanity checks before the models are combined. Both SRILM logs must have
# produced the same number of entries, and each counter must agree with the
# number of probabilities actually stored. Any mismatch aborts with status -2.
if ngram1_index != ngram2_index
	STDERR.puts "Line counts differ: #{ngram1_index} versus #{ngram2_index}"
	exit(-2)
end

if ngram1_index != prob[:firstngram].length
	STDERR.puts "Line counts differ: #{ngram1_index} versus #{prob[:firstngram].length}"
	exit(-2)
end

if ngram2_index != prob[:secondngram].length
	# Report the second log's own counter so the message stays correct even
	# if the equality check above is ever relaxed or reordered.
	STDERR.puts "Line counts differ: #{ngram2_index} versus #{prob[:secondngram].length}"
	exit(-2)
end


# Interpolation weight applied to the second n-gram model's sentence
# probability; the first n-gram model receives (1 - weight).
weight=0.36

# Totals over "usable" sentences, i.e. those to which the HHMM and both
# n-gram models assign a non-zero probability. Log sums are natural logs.
usable_word_count = 0
usable_oov_count = 0
usable_logprob_sum = 0

ngram1_logprob_sum = 0
ngram2_logprob_sum = 0

word_count.each_with_index{|w,i| puts "word_count[#{i}]=#{w}"}

# The final n-gram entry is deliberately left out of the loop.
# NOTE(review): presumably it is SRILM's corpus-level summary, not a sentence -- confirm.
0.upto(prob[:firstngram].length-2) { |i|

#	puts "Sentence #{i}	word_count=#{word_count[i]}	oov=#{oov_count[i]}	p_HHMM(sent #{i})=#{prob[:hhmm][i]}	p_ngram1(sent #{i})=#{prob[:firstngram][i]}	p_ngram2(sent #{i})=#{prob[:secondngram][i]}"

	# Fail with a clear message instead of a NoMethodError on nil when the
	# HHMM log covers fewer sentences than the SRILM logs.
	hhmm_prob = prob[:hhmm][i]
	if hhmm_prob.nil?
		STDERR.puts "No HHMM probability for sentence #{i}: the HHMM log covers fewer sentences than the SRILM logs"
		exit(-2)
	end

	if (hhmm_prob > 0 && prob[:firstngram][i] > 0  && prob[:secondngram][i] > 0)

		# Counts are only needed for usable sentences; a missing one would
		# otherwise surface as a TypeError when added to the totals.
		if word_count[i].nil? || oov_count[i].nil?
			STDERR.puts "Missing word or OOV count for sentence #{i} in HHMM log"
			exit(-2)
		end

		usable_logprob_sum += Math.log(weight*prob[:secondngram][i] + (1 - weight)*prob[:firstngram][i])
		usable_word_count  += word_count[i]
		usable_oov_count   += oov_count[i]

		ngram1_logprob_sum   += Math.log(prob[:firstngram][i])
		ngram2_logprob_sum  += Math.log(prob[:secondngram][i])
	end

}

puts
puts

# Perplexity from a summed natural-log probability: e**(-logprob_sum / token_count).
# The sum is converted to Float first so that an Integer 0 sum over 0 tokens
# (no usable sentences) prints NaN instead of raising ZeroDivisionError.
perplexity = lambda { |logprob_sum, token_count| Math::E**(-logprob_sum.to_f / token_count) }

# Each report prints two perplexities: the first normalised by the number of
# non-OOV words, the second by the number of all words.

# First n-gram model: ngram1_logprob_sum is accumulated from prob[:firstngram].
ppl        = perplexity.call(ngram1_logprob_sum, usable_word_count - usable_oov_count)
ppl_no_oov = perplexity.call(ngram1_logprob_sum, usable_word_count)
puts "First N-gram logprob = #{ngram1_logprob_sum}\twords = #{usable_word_count}"
puts "First N-gram perplexity = #{ppl}\t#{ppl_no_oov}"
puts

# Second n-gram model: ngram2_logprob_sum is accumulated from prob[:secondngram].
ppl = perplexity.call(ngram2_logprob_sum, usable_word_count - usable_oov_count)
ppl_no_oov = perplexity.call(ngram2_logprob_sum, usable_word_count)
puts "Second N-gram logprob = #{ngram2_logprob_sum}\twords = #{usable_word_count}"
puts "Second N-gram perplexity = #{ppl}\t#{ppl_no_oov}"
puts

# Linear interpolation of the two n-gram models.
ppl = perplexity.call(usable_logprob_sum, usable_word_count - usable_oov_count)
ppl_no_oov = perplexity.call(usable_logprob_sum, usable_word_count)
puts "Interpolated logprob = #{usable_logprob_sum}\twords = #{usable_word_count}"
puts "Interpolated perplexity = #{ppl}\t#{ppl_no_oov}"
puts