###############################################################################
##                                                                           ##
## This file is part of ModelBlocks. Copyright 2009, ModelBlocks developers. ##
##                                                                           ##
##    ModelBlocks is free software: you can redistribute it and/or modify    ##
##    it under the terms of the GNU General Public License as published by   ##
##    the Free Software Foundation, either version 3 of the License, or      ##
##    (at your option) any later version.                                    ##
##                                                                           ##
##    ModelBlocks is distributed in the hope that it will be useful,         ##
##    but WITHOUT ANY WARRANTY; without even the implied warranty of         ##
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          ##
##    GNU General Public License for more details.                           ##
##                                                                           ##
##    You should have received a copy of the GNU General Public License      ##
##    along with ModelBlocks.  If not, see <http://www.gnu.org/licenses/>.   ##
##                                                                           ##
###############################################################################

#!/usr/bin/ruby

#####################################################################
# noisyout2surprisal.rb
# 
# calculates surprisal at the end of each word 
#
# TO RUN: 
#  cat <filename> | bin/hhmmparser-hdwd -s scratch/genmodel/QF.wsjnphd.model scratch/genmodel/POS.wsjnphd.model scratch/genmodel/LMod.wsjnphd.model > surprisalparser.out
#  cat surprisalparser.out | ruby scripts/noisyout2surprisal.rb
#
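# OPTIONS:
#  -f for the prefix at each time step, output as float
#  -l for the prefix at each time step, output as logprob
#  -1 output comma-separated statistics without words, one line per word
#  -s output the surprisal at each time step
#  -e output the entropy reduction at each time step
#  -d output the average stack depth at each time step
#  -p calculate the per-word perplexity (with the best parse)
#  -P calculate the per-word perplexity (with beam of parses)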
#
######################################################################


##### parse options
require 'optparse'

# Command-line switches. Each row is [short flag, long flag, help text, key];
# when a switch is present on the command line, options[key] is set to the
# value OptionParser yields for it. Rows are registered in this order, which
# is also the order they appear in the generated help.
options = {}
flag_table = [
  ["-f", "--fprefix",          "List prefix in float",                                        :fprefix],
  ["-l", "--lprefix",          "List prefix in logprob",                                      :lprefix],
  ["-1", "--oneline",          "List comma-separated statistics w/o words, 1 line per word", :oneline],
  ["-s", "--surprisal",        "Calculate surprisal (need sum=e^x)",                          :surprisal],
  ["-p", "--perplexity",       "Calculate perplexity with argmax",                            :mlsperp],
  ["-P", "--Perplexity",       "Calculate perplexity with beam",                              :beamperp],
  ["-e", "--entropyreduction", "Calculate entropy reduction (need entropy=e^x)",              :entropy],
  ["-d", "--stackdepth",       "Display average stack depth (need avg(d)=x",                  :depth]
]

parser = OptionParser.new
parser.banner = "Usage: ruby noisyout2surprisal.rb [options]"
flag_table.each do |short, long, help, key|
  parser.on(short, long, help) { |value| options[key] = value }
end
# parse! consumes the recognised switches from ARGV in place.
parser.parse!


##### turn very-noisy outputs of hhmmparser.hdwd into prefix probabilities, then surprisal
# Corpus-wide accumulators used for the final perplexity figure.
totwords = 0
totneglogprob = 0

# Sentence counter (1-based) and the best-parse score recorded per sentence,
# indexed by sentence number.
sentcount = 1
mlsprob = []

# Per-frame statistics, indexed by the frame number read from the input.
# The reporting loop starts at index 1; "words" keeps the sentence number
# (followed by a space) in slot 0 so that word i lines up with frame i.
words = ["#{sentcount} "]
prefix, surprisal, depth = [], [], []
entropy = [0]
entropyreduction = [0]

# In one-line mode, emit a single comma-separated header row naming the
# statistics that were requested. The table order fixes the column order.
outputheaders = []
if options[:oneline]
  [
    [:fprefix,   "fprefix"],
    [:lprefix,   "lprefix"],
    [:surprisal, "surprisal"],
    [:entropy,   "entropyrdc"],
    [:depth,     "avgdepth"]
  ].each do |key, label|
    outputheaders.push(label) if options[key]
  end
  print outputheaders.join(',') + "\n"
end

# Main loop: read the parser output line by line from STDIN, accumulate
# per-frame statistics for the current sentence, and report them whenever a
# separator line (one containing '------') is seen.
while (line = STDIN.gets)
  parts = line.split(' ')

  # A blank line splits to an empty array and a one-token line has no second
  # field; coerce both to strings so such lines are skipped, not fatal.
  tag  = parts[0].to_s
  stat = parts[1].to_s

  if tag.include?('f=')
    # frame index is whatever follows '=' in the first field
    frame = tag.split('=')[1].to_i

    # beam elements w/ sums: add e^x to the running prefix probability
    if stat.include?('sum')
      prob = Math.exp(stat.split('^')[1].to_f)
      prefix[frame] = prob + (prefix[frame] || 0.0)
    end

    # beam elements w/ entropies (only tracked when requested)
    if options[:entropy] && stat.include?('entropy')
      entropy[frame] = Math.exp(stat.split('^')[1].to_f)
    end

    # beam elements w/ average depths (only tracked when requested)
    if options[:depth] && stat.include?('avg(d)')
      depth[frame] = stat.split('=')[1].to_f
    end
  end

  # hypoths: store the word found in the fourth field
  if tag.include?('HYPOTH')
    word = parts[3]
    # in one-line mode the 'eos' token is not stored as a word
    if word != 'eos' || !options[:oneline]
      words.push(word + " ")
    end
    if word == 'eos'
      # the sixth field with its first and last characters removed,
      # scaled down by 100, is kept as this sentence's best-parse score
      mlsprob[sentcount] = parts[5][1, parts[5].length - 2].to_f / 100
    end
  end

  # end of sentence, print out stats, start new sentence
  if line.include?('------')
    print words[0] unless options[:oneline]

    # one-line mode stops one frame earlier than the wordy mode
    # (presumably to skip the frame of the unstored 'eos' token -- confirm)
    lastwordind = options[:oneline] ? prefix.length - 2 : prefix.length - 1
    (1..lastwordind).each do |i|
      ldelim = ''
      rdelim = "\n"
      unless options[:oneline]
        print words[i]
        ldelim = '('
        rdelim = ') '
      end

      outputstats = []
      outputstats.push(prefix[i].to_s) if options[:fprefix]
      outputstats.push((-Math.log(prefix[i])).to_s) if options[:lprefix]
      if options[:surprisal]
        # surprisal = log(previous prefix) - log(current prefix);
        # the prefix before the first word is taken to be 1.0
        surprisal[i] = -Math.log(prefix[i]) + Math.log(i == 1 ? 1.0 : prefix[i - 1])
        outputstats.push(surprisal[i])
      end
      if options[:entropy]
        entropyreduction[i] = entropy[i] - entropy[i - 1]
        outputstats.push(entropyreduction[i])
      end
      outputstats.push(depth[i]) if options[:depth]
      print ldelim + outputstats.join(',') + rdelim
    end

    if options[:mlsperp] || options[:beamperp]
      # normalize by string length, + eos
      sentlength = words.length - 1

      # Negative log probability of the whole sentence, or nil when none is
      # available (no 'eos' hypothesis seen, or no 'sum' line seen).
      sentneglogprob =
        if options[:mlsperp]
          mlsprob[sentcount] && -mlsprob[sentcount]
        elsif !prefix.empty?
          -Math.log(prefix.last)
        end

      if sentneglogprob && sentlength > 0
        wordneglogprob = sentneglogprob / sentlength
        totneglogprob += sentneglogprob
        # Only sentences that contributed a score are counted, so unscored
        # sentences do not dilute the corpus average.
        totwords += sentlength
      else
        wordneglogprob = 0
        sentlength = 0
      end

      print " = e^-" + wordneglogprob.to_s + "(" + sentlength.to_s + ")\n"
    end

    print "\n" unless options[:oneline]

    # reset for next sentence
    # NOTE(review): entropy and depth are not reset here, so a frame missing
    # from the next sentence would reuse the previous sentence's value.
    sentcount += 1
    words = ["#{sentcount} "]
    prefix = []
    surprisal = []
  end

end

# Corpus-level summary: average negative log probability per word and the
# corresponding perplexity, printed when either perplexity switch is on.
if options[:mlsperp] || options[:beamperp]
  if totwords > 0
    avgneglogprob = totneglogprob / totwords.to_f
    # The accumulated values are natural logs (computed with Math.log and
    # reported per sentence as "e^-..."), so the matching base is e.
    perplexity = Math.exp(avgneglogprob)
    print avgneglogprob.to_s + " => perp(corpus)=" + perplexity.to_s + "\n"
  else
    # Nothing was scored; dividing by zero would raise or yield NaN.
    warn "noisyout2surprisal: no words scored; corpus perplexity is undefined"
  end
end
