###############################################################################
##                                                                           ##
## This file is part of ModelBlocks. Copyright 2009, ModelBlocks developers. ##
##                                                                           ##
##    ModelBlocks is free software: you can redistribute it and/or modify    ##
##    it under the terms of the GNU General Public License as published by   ##
##    the Free Software Foundation, either version 3 of the License, or      ##
##    (at your option) any later version.                                    ##
##                                                                           ##
##    ModelBlocks is distributed in the hope that it will be useful,         ##
##    but WITHOUT ANY WARRANTY; without even the implied warranty of         ##
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          ##
##    GNU General Public License for more details.                           ##
##                                                                           ##
##    You should have received a copy of the GNU General Public License      ##
##    along with ModelBlocks.  If not, see <http://www.gnu.org/licenses/>.   ##
##                                                                           ##
###############################################################################

#!/usr/bin/ruby

## Gathers counts from trees for a POS tagger model: individual word counts,
## individual tag counts, and tag/word pair counts.
## Also builds word classes for unknown words based on morphology, spelling and
## other factors.  For details see the Stanford parser package
## (parser.lexparser, classes Options, EnglishUnknownWordModel and
## UnknownWordModel).  The model here is what they call level 5, with
## English-specific unknown word features.

require 'scripts/umnlp.rb'
require 'jcode' #need for each_char


## Counters filled while reading the trees.

# Statistics over every non-empty word/tag occurrence.
$seen_hash = CounterHash.new # Seen word/tag pairs, keyed "word__tag"
$seenw_hash = CounterHash.new # Seen words
$seent_hash = CounterHash.new # Seen tags

# Statistics over occurrences that were counted as unknown words.
$unkt_hash = CounterHash.new # Unknown-word class/tag pairs, keyed "class__tag"
$unseenw_hash = CounterHash.new # Unknown-word classes
$unseent_hash = CounterHash.new # Tags for unseen words

# NOTE(review): $unk_hash is not referenced in the visible part of this file.
$unk_hash = CounterHash.new
#$tag_hash = Hash.new

# First command-line argument: the input line number that the reading loop
# compares against to decide when new words start counting as unknown.
$unk_thresh = ARGV.first.to_i

# Simple mutable pair holding one token and the tag assigned to it.
# Both fields default to nil when omitted.
class WordTag
  attr_accessor :word
  attr_accessor :tag

  def initialize(w = nil, t = nil)
    @word, @tag = w, t
  end
end

class Tree
  # Collects the word/tag pairs at the leaves of this subtree, left to right.
  #
  # A node without children is a leaf whose @head is split at its last '#'
  # into "TAG#word"; a head that does not contain '#' yields a WordTag whose
  # word and tag are both nil.  Inner nodes concatenate the results of their
  # children.
  #
  # Returns an Array of WordTag.  If anything goes wrong while reading this
  # node, the problem is reported on $stderr and an empty Array is returned,
  # so one malformed node does not abort the caller or leak a nil into the
  # parent's result.
  def getWordsTags
    if @children.size == 0
      @head =~ /(.*)\#(.*)/
      return [WordTag.new(Regexp.last_match(2), Regexp.last_match(1))]
    end

    pairs = []
    # Each child already returns a flat array, so concat is enough.
    @children.each { |child| pairs.concat(child.getWordsTags) }
    pairs
  rescue StandardError => e
    # StandardError only: interrupts and exit requests must still propagate.
    $stderr.puts "Exception caught when tree = #{to_s}: #{e.message}"
    []
  end
end

line_num = 0
unk_c = 0
total = 0

# Endings tried in this order for words of five or more characters; the first
# one that matches is appended to the unknown-word class.
UNK_SUFFIXES = %w[ed ing ion er est ly ity y al].freeze

# Builds the unknown-word class string ("UNK" plus feature suffixes) for a
# word.
#
# word        - non-empty String token.
# position    - index of the token within its sentence; only index 0 can
#               receive the "-INITC" (sentence-initial capital) feature.
# known_words - object responding to has_key?; consulted with the lowercased
#               word to decide on "-KNOWNLC".
#
# Returns the class String, e.g. "UNK-INITC-ing" or "UNK-LC-NUM-DASH-s".
def unknown_word_class(word, position, known_words)
  # Work on byte values so the ASCII range checks behave the same whether or
  # not String#[] returns an integer.
  bytes = word.unpack("C*")
  num_caps = 0
  has_digit = false
  has_dash = false
  has_lower = false
  bytes.each do |b|
    if b >= 48 && b <= 57       # '0'..'9'
      has_digit = true
    elsif b == 45               # '-'
      has_dash = true
    elsif b >= 65 && b <= 90    # 'A'..'Z'
      num_caps += 1
    elsif b >= 97 && b <= 122   # 'a'..'z' (91..96 are punctuation, not letters)
      has_lower = true
    end
  end

  first = bytes.first
  starts_upper = !first.nil? && first >= 65 && first <= 90
  lowered = word.downcase
  wlen = word.size

  unk_str = "UNK"
  if starts_upper
    if position == 0 && num_caps == 1
      unk_str += "-INITC"
      unk_str += "-KNOWNLC" if known_words.has_key?(lowered)
    else
      unk_str += "-CAPS"
    end
  elsif num_caps > 0
    unk_str += "-CAPS"
  elsif has_lower
    unk_str += "-LC"
  end
  unk_str += "-NUM" if has_digit
  unk_str += "-DASH" if has_dash

  if lowered[-1, 1] == "s" && wlen >= 3
    # A final "s" only counts as a suffix when not preceded by s, i or u.
    unk_str += "-s" unless %w[s i u].include?(lowered[-2, 1])
  elsif wlen >= 5 && !has_dash && !(has_digit && num_caps > 0)
    suffix = UNK_SUFFIXES.find { |s| lowered[-s.size, s.size] == s }
    unk_str += "-#{suffix}" if suffix
  end
  unk_str
end

# Read one tree per input line and update the global counters.
while (line = $stdin.gets)
  line_num += 1
  # getWordsTags can hand back nil (or nil entries) for a tree it failed to
  # read; treat those as "nothing to count" instead of crashing.
  word_tags = Tree.new(line).getWordsTags || []
  word_tags.each_with_index do |wordtag, i|
    next if wordtag.nil?

    word = wordtag.word
    tag = wordtag.tag
    $seen_hash["#{word}__#{tag}"] += 1
    next if word.nil? || tag.nil? || word == "" || tag == ""

    total += 1
    $seent_hash[tag] += 1

    already_known = $seenw_hash.has_key?(word)
    # Every counted word is recorded as seen, including a new word on the line
    # whose number equals the threshold (previously it fell between the "<"
    # and ">" tests and was dropped from $seenw_hash).
    $seenw_hash[word] += 1
    next if already_known || line_num <= $unk_thresh

    # First sighting of this word after the threshold: count it as unknown.
    unk_str = unknown_word_class(word, i, $seenw_hash)
    $unkt_hash["#{unk_str}__#{tag}"] += 1
    $unseent_hash[tag] += 1
    $unseenw_hash[unk_str] += 1
    unk_c += 1
  end
end

#total = $seenw_hash.size()
smooth = 1.0 # Value output by stanford-parser run in verbose mode...

## Emit one "UNK <tag> : <class> = <value>" line for every unknown-word class
## and tag whose value is positive.  The value is a smoothed estimate of
## P(tag | class), multiplied by 1/total and divided by the tag's relative
## frequency among all counted tokens.
p_W = 1.0 / total.to_f # Same for every class/tag pair, so computed once.

$unseenw_hash.each do |key, value|
  $seent_hash.each_key do |tag|
    c_TS = $unkt_hash["#{key}__#{tag}"] # Count of this class with this tag
    c_S = value                         # Count of this class
    c_T = $unseent_hash[tag]            # Count of this tag on unknown words
    c_Tseen = $seent_hash[tag]          # Count of this tag overall

    # Prior: how often the tag occurs among all unknown-word tokens.
    p_T_U = c_T.to_f / unk_c.to_f
    pb_T_S = (c_TS.to_f + smooth * p_T_U) / (c_S.to_f + smooth)
    if pb_T_S > 1
      $stderr.puts "key=#{key}, c_TS=#{c_TS}, c_S=#{c_S}"
    end

    p_T = c_Tseen.to_f / total.to_f
    pb_W_T = pb_T_S * p_W / p_T
    next unless pb_W_T > 0.0

    # Tag and class go in as %s arguments so that a literal '%' in either one
    # is printed verbatim instead of being parsed as a format directive.
    printf "UNK %s : %s = %.12f\n", tag, key, pb_W_T
  end
end


smoothInUnknownsThreshold = 100 ## Seen in training script for stanford-parser
smooth_seen = 0.2 ## Seen in verbose output training stanford-parser

## Emit "Pw <word> : <tag> = <value>" lines for every seen word.
$seenw_hash.each do |word, count|
  frequent = count > smoothInUnknownsThreshold
  $seent_hash.each_key do |tag|
    wt_count = $seen_hash["#{word}__#{tag}"]
    if frequent
      ## If we've seen this word enough, use the plain relative frequency and
      ## list only the tags it actually occurred with.
      next unless wt_count > 0
      pb_T_W = wt_count.to_f / count.to_f
    else
      ## Mix in with priors on tags (tag frequency among unknown words).
      ## With no unknown words counted the prior would be 0/0 = NaN, which
      ## would silently suppress every line below; use a zero prior instead.
      p_T_U = unk_c > 0 ? $unseent_hash[tag].to_f / unk_c.to_f : 0.0
      pb_T_W = (wt_count.to_f + smooth_seen * p_T_U) / (count.to_f + smooth_seen)
      next unless pb_T_W > 0.000000000001
    end
    # Word and tag are passed as %s arguments so a literal '%' in either one
    # cannot be misread as a format directive.
    printf "Pw %s : %s = %.12f\n", word, tag, pb_T_W
  end
end

## Emit the relative frequency of every tag, in sorted tag order.
$seent_hash.keys.sort.each do |key|
  value = $seent_hash[key]
  # The key is passed as a %s argument: a '%' inside it must not be parsed as
  # part of the format string.
  printf "P : %s = %.12f\n", key, (value.to_f / total.to_f)
end

## Emit the raw count of every seen word, in sorted word order.
$seenw_hash.keys.sort.each do |key|
  value = $seenw_hash[key]
  # Words such as "%" would otherwise make printf raise on a malformed format.
  printf "W : %s = %d\n", key, value.to_i
end


