###############################################################################
##                                                                           ##
## This file is part of ModelBlocks. Copyright 2009, ModelBlocks developers. ##
##                                                                           ##
##    ModelBlocks is free software: you can redistribute it and/or modify    ##
##    it under the terms of the GNU General Public License as published by   ##
##    the Free Software Foundation, either version 3 of the License, or      ##
##    (at your option) any later version.                                    ##
##                                                                           ##
##    ModelBlocks is distributed in the hope that it will be useful,         ##
##    but WITHOUT ANY WARRANTY; without even the implied warranty of         ##
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          ##
##    GNU General Public License for more details.                           ##
##                                                                           ##
##    You should have received a copy of the GNU General Public License      ##
##    along with ModelBlocks.  If not, see <http://www.gnu.org/licenses/>.   ##
##                                                                           ##
###############################################################################

#!/usr/bin/ruby

######################################################################
# calcHdwdTree-srl.rb
# 
# Annotates trees with headwords floating up by syntax rules
# TO RUN: cat genmodel/<file>.ucnftrees | ruby scripts/calcHdwdTree-srl.rb [-c FREQ] [-b [NUM]] [-v]
#
# OPTIONS:
#  -c FREQ  consider "unk" headword if its count < cut threshold FREQ
#  -b [NUM] only select the NUM best words (based on their counts) for headword
#           (NUM defaults to 1000 when omitted); the rest are "unk"
#  -v       verbose mode
#
######################################################################

require "scripts/umnlp.rb"

##### parse options
require 'optparse'

# Running state shared with the Tree methods below:
#   $ctr  - number of trees handled so far (quoted in verbose warnings)
#   $line - the input line currently being processed (quoted in error messages)
$ctr, $line = 0, ""

# Regexp fragments (interpolated into patterns further down) describing the
# "<label>!pbrdelim!" and "<label>!semdelim!" prefixes found on node labels.
$pbrLabel = "\\w+!pbrdelim!"
$semLabel = "\\w+!semdelim!"

# Specific semantic-label prefixes: "id" is the marker expected on a head
# child, "m" is the marker written onto a modifier child, and "a<digit>" is a
# further label pattern kept alongside them.
$idLabel = "id!semdelim!"
$aLabel  = "a\\d!semdelim!"
$mLabel  = "m!semdelim!"

# Run-time settings, filled in by parse_options:
#   :cut     - frequency threshold for "unk" replacement (0 = disabled)
#   :best    - number of top-ranked headwords to keep (-1 = disabled)
#   :verbose - extra diagnostics on stderr
$options = {}
$options[:cut] = 0
$options[:best] = -1
$options[:verbose] = false

# Parses the command-line switches in +argv+ (consumed destructively, like
# OptionParser#parse!) and stores the results in $options.
#
# Raises OptionParser::MissingArgument when -c/--cut is given without a
# frequency.  Previously that stored nil in $options[:cut], and the script
# only failed much later (nil + 1 / Integer > nil) after reading all input.
# -b/--best without a number keeps its existing default of 1000.
def parse_options(argv = ARGV)
  OptionParser.new do |opts|
    opts.banner = "Usage: ruby calcHdwdTree-srl.rb [options]"

    opts.on("-v", "--verbose", "turns on extra stderr output") do |v|
      $options[:verbose] = v
    end
    opts.on("-c", "--cut [freq]", Integer, "treat headword frequencies below the given number as unk") do |c|
      # No sensible default threshold is defined, so fail fast instead of guessing one.
      raise OptionParser::MissingArgument, "a frequency value is required" if c.nil?
      $options[:cut] = c
    end
    opts.on("-b", "--best [num]", Integer, "everything but the top frequent words is treated as unk") do |b|
      $options[:best] = (b.nil? ? 1000 : b)
    end
  end.parse!(argv)
end

parse_options

  # Head-percolation table: bare category => "<direction> <candidate> <candidate> ...".
  # The leading token is the search direction ("0" indicates search R-to-L,
  # "1" indicates search L-to-R); the remaining tokens are candidate child
  # categories in priority order.  NP is dealt with separately (see hdwdbattle).
  Hdwdrules = {
    "ADJP"   => "0 NNS QP NN $ ADVP JJ VBN VBG ADJP JJR NP JJS DT FW RBR RBS SBAR RB",
    "ADVP"   => "1 RB RBR RBS FW ADVP TO CD JJR JJ IN NP JJS NN",
    "CONJP"  => "1 CC RB IN",
    "FRAG"   => "1 ",
    "INTJ"   => "0 ",
    # was "1 LS :"; we change penn category of ":" to be ","
    "LST"    => "1 LS ,",
    "NAC"    => "0 NN NNS NNP NNPS NP NAC EX $ CD QP PRP VBG JJ JJS JJR ADJP FW",
    # was "1 IN TO VBG VBN RP RB FW PP"; added RB after RP, and PP at the end.
    # Added NP right before TO, could be even before IN. Added S(provbg)
    "PP"     => "1 NP S IN TO VBG VBN RP RB FW PP",
    "PRN"    => "0 ",
    "PRT"    => "1 RP",
    "QP"     => "0 $ IN NNS NN JJ RB DT CD NCD QP JJR JJS",
    "RRC"    => "1 VP NP ADVP ADJP PP",
    # was "0 TO IN VP S SBAR VB VBD VBN VBG VBP VBZ ADJP UCP NP";
    # added all the VB* ones after SBAR
    "S"      => "0 VP TO IN S SBAR VB VBD VBN VBG VBP VBZ ADJP UCP NP",
    # added WHSBAR
    "SBAR"   => "0 WHNP WHPP WHADVP WHADJP IN DT S SQ SINV SBAR WHSBAR FRAG",
    "SBARQ"  => "0 SQ S SINV SBARQ FRAG",
    "SINV"   => "0 VBZ VBD VBP VB MD VP S SINV ADJP NP",
    "SQ"     => "0 VBZ VBD VBP VB MD VP SQ",
    "UCP"    => "1 ",
    # was "0 TO VBD VP VBN MD VBZ VB VBG VBP ADJP NN NNS NP";
    # moved up VP, prior to any VBX. move TO to after all the VXX
    "VP"     => "0 VP VBD VBN MD VBZ VB VBG VBP TO ADJP NN NNS NP",
    "WHADJP" => "0 CC WRB JJ ADJP",
    "WHADVP" => "1 CC WRB",
    "WHNP"   => "0 WDT WP WP$ WHADJP WHPP WHNP",
    "WHPP"   => "1 IN TO FW",
  }

  # Additional rules to account for previous binarization & processing: every
  # tag of a part-of-speech family shares that family's rule.
  [
    ["0 NN NNS NNP NNPS",        %w[NN NNS NNP NNPS]],
    ["1 VBZ VBD VBP VB VBN VBG", %w[VBZ VBD VBP VB VBN VBG]],
    ["0 JJ JJR JJS",             %w[JJ JJR JJS]],
    ["0 RB RBR RBS",             %w[RB RBR RBS]],
    ["0 CD",                     %w[CD]],
    ["1 IN",                     %w[IN]],
  ].each do |family_rule, family_tags|
    family_tags.each { |pos_tag| Hdwdrules[pos_tag] = family_rule.dup }
  end

  # LIST<cat> nodes look for <cat>, searching L-to-R.
  %w[NP SBAR SINV S VP].each { |listed| Hdwdrules["LIST" + listed] = "1 " + listed }

  # taken from SBAR, added VP to the end
  # (was "0 WHNP WHPP WHADVP WHADJP IN DT S SQ SINV SBAR FRAG VP")
  Hdwdrules["WHSBAR"]  = "0 S WHNP WHPP WHADVP WHADJP IN DT SQ SINV SBAR FRAG VP"
  Hdwdrules["WHSBARQ"] = "0 SQ S SINV SBARQ FRAG"
  
  # Fallback priority list used by Tree#last_resort: the first letter (in this
  # order) that starts a child's category decides which child supplies the head.
  Lastrules = %w[S s V v N n W w P p I i U u A a R r J j Q q C c L l]

  # Headword => number of nodes that received it (missing keys read as 0).
  $hdwdfreqs = Hash.new(0)
  # Headword => frequency rank, 0 being the most frequent (missing keys read as 0).
  $hdwdranks = Hash.new(0)
  
  #####
  # Headword-percolation extensions for Tree.
  #
  # This reopens Tree; its base definition is not in this file (presumably it
  # comes from scripts/umnlp.rb -- confirm).  The methods below rely on the
  # base class providing @head, @children and @parent together with the
  # `head`, `children` and `parent=` accessors.
  class Tree

    # Headword assigned to this node by #percolate.
    attr_accessor :hdwd
    # Label whose bare category selects the head rule in #hdwdbattle.  It is
    # the node's own label, except for binary nodes whose label contains '_',
    # which inherit their parent's uschead.
    attr_accessor :uschead

    # Steps tried in order for NP/NX nodes (see Collins' notes,
    # magerman-black.txt; his step 1, identifying POS words, is skipped).
    # Each step is [child index, categories]: index 1 is the right child,
    # index 0 the left child.
    NP_HEAD_STEPS = [
      [1, ["NN", "NNP", "NNPS", "NNS", "NX", "POS", "JJR"]],
      [0, ["NP"]],
      [1, ["$", "ADJP", "PRN"]],
      [1, ["CD"]],
      [1, ["JJ", "JJS", "RB", "QP"]],
    ].freeze

    # Called on the child selected as head.  The label is left untouched; in
    # verbose mode a warning is printed when it lacks the "id" marker.
    def setHead
      if @head !~ /#{$idLabel}/ && $options[:verbose]
        $stderr.print $ctr, " expect id but got ", @head, "\n"
      end
    end

    # Called on the child that was not selected as head: rewrites the "id"
    # marker in its label to the "m" marker (in place).  In verbose mode a
    # warning is printed when there is no "id" marker to rewrite.
    def setModifier
      if @head =~ /#{$idLabel}/
        @head.sub!(/#{$idLabel}/, $mLabel)
      elsif $options[:verbose]
        $stderr.print $ctr, " expect m but got ", @head, "\n"
      end
    end

    ### headword percolation
    # Assigns @hdwd bottom-up (binary trees only) and counts every assignment
    # in $hdwdfreqs.  Malformed nodes are reported on stderr and left without
    # a headword.  Always returns nil.
    def percolate
      if @children.empty?
        $stderr.print "ERROR: EXPECT (PRETERM), POS, WORD NODES IN RCTREE!!! curr:#{@head}\nLine=#{$line}\n"
        return
      end
      only = @children[0]
      if @children.size == 1 && only.children.size == 1 && !only.children[0].children.empty?
        $stderr.print "ERROR: EXPECT (PRETERM), POS, WORD NODES IN RCTREE!!! curr:#{@head} child:#{only.head} granch:#{only.children[0].head}\n"
        return
      end

      @uschead = @head

      case @children.size
      when 1
        if only.children.empty?
          ## terminal case: the leaf is "<POS>#<word>", optionally followed by
          ## ":REL-..." (e.g. ":REL-say.01"); the headword is the <word> part.
          @hdwd = only.head.split('#', 2)[1].split(':REL-', 2)[0]
          only.hdwd = only.head   # hijacked to give POS also
        else
          ## unary case
          only.percolate
          @hdwd = only.hdwd
        end
      when 2
        ## binary case
        ## underscore sections of trees use the parent's head for rule lookup
        @uschead = @parent.uschead if @head.include?('_')
        @children[0].percolate
        @children[1].percolate
        ## head rules battle it out!
        @hdwd = hdwdbattle
      else
        $stderr.print "ERROR: calcHdwdTree.rb requires binary trees. Node #{@head} has #{@children.size} children.\n"
        return
      end

      ## count up the result
      $hdwdfreqs[@hdwd] += 1
      nil
    end

    # Strips a trailing "-arg..." suffix (-argNP, -argVP, ...) from +cat+ and
    # returns the last run of upper-case letters, "$" or ",".  Returns nil
    # when there is no such run.
    def getBareCategory(cat)
      cat.sub(/\-arg.*$/, '').scan(/[A-Z\$\,]+/).last
    end

    # Decides which of the two children supplies the headword, marks the
    # children via setHead/setModifier and returns the winning headword.
    #
    # Categories listed in Hdwdrules are resolved with their rule; NP and NX
    # (and a uschead of exactly "PP-tmp") follow NP_HEAD_STEPS; anything left
    # unresolved goes to #last_resort.
    def hdwdbattle
      bald = getBareCategory(@uschead)  # bald = a bare head

      if Hdwdrules.key?(bald) && @uschead != "PP-tmp"
        okhdlist = Hdwdrules.fetch(bald).split
        seekdirection = okhdlist.shift

        if okhdlist.empty?
          # No candidates listed: "1" prefers the right child, "0" the left.
          # NOTE(review): the original author flagged the handling of
          # direction "1" as a possible bug -- for non-empty lists "1" tries
          # the left child first, here it prefers the right one.
          return seekdirection == "1" ? return_r_child : return_l_child
        end

        # For each candidate category in priority order, test the children in
        # search order: "1" checks left to right, "0" right to left.
        sides = seekdirection == "1" ? [0, 1] : [1, 0]
        okhdlist.each do |okhd|
          sides.each do |side|
            next unless child_matches?(@children[side], okhd)
            return side == 0 ? return_l_child : return_r_child
          end
        end
      elsif bald == "NP" || bald == "NX" || @uschead == "PP-tmp"
        NP_HEAD_STEPS.each do |side, cats|
          next unless cats.include?(getBareCategory(@children[side].head))
          return side == 0 ? return_l_child : return_r_child
        end
        return return_r_child
      end

      last_resort
    end

    # Fallback when no head rule applies: walks Lastrules in order and picks
    # the first child (left before right) whose category, after the semantic
    # label and optional pbr label, starts with that letter.  Defaults to the
    # left child.  Marks the children and returns the picked child's headword.
    def last_resort
      chosen = nil
      Lastrules.each do |letter|
        pattern = /^#{$semLabel}(#{$pbrLabel})?#{letter}/
        chosen = [0, 1].find { |idx| @children[idx].head.match(pattern) }
        break if chosen
      end
      chosen ||= 0

      picked_label = @children[chosen].head
      headword = promote_child(chosen)
      if $options[:verbose]
        $stderr.print "Unable to resolve hdwd for #{@head} -> #{@children[0].head} #{@children[1].head}, picked #{picked_label}\n"
      end
      headword
    end

    # Prefers the right child as head, falling back to the left one; returns
    # the chosen headword, or "-" when neither child qualifies.
    #
    # NOTE(review): both branches test the second character of the RIGHT
    # child's label (head[1] != '!'), even the branch that selects the left
    # child.  This looks like a copy-paste slip but is kept as found --
    # confirm the intent.  Also note that on Ruby 1.8 String#[] with an
    # integer returns a character code, so the comparison is always true.
    def return_r_child
      right_ok = @children[1].head[1] != '!'
      if @children[1].head.match(/[A-Za-z]/) && right_ok
        promote_child(1)
      elsif @children[0].head.match(/[A-Za-z]/) && right_ok
        promote_child(0)
      else
        "-"
      end
    end

    # Prefers the left child as head, falling back to the right one; returns
    # the chosen headword, or "-" when neither child qualifies.
    #
    # NOTE(review): as in return_r_child, the '!' test always inspects the
    # right child's label -- confirm the intent.
    def return_l_child
      right_ok = @children[1].head[1] != '!'
      if @children[0].head.match(/[A-Za-z]/) && right_ok
        promote_child(0)
      elsif @children[1].head.match(/[A-Za-z]/) && right_ok
        promote_child(1)
      else
        "-"
      end
    end

    # Bracketed rendering "(label{headword} child child )"; a leaf renders as
    # its headword only.
    def hdwd_to_s
      return "#{@hdwd}" if @children.empty?
      "(#{@head}{#{@hdwd}} " + @children.map { |child| child.hdwd_to_s + " " }.join + ")"
    end

    # Same rendering as hdwd_to_s, but every headword is first passed through
    # the --cut / --best filter and may come out as "unk".
    def to_unk_s
      headword = unk_headword
      return "#{headword}" if @children.empty?
      "(#{@head}{#{headword}} " + @children.map { |child| child.to_unk_s + " " }.join + ")"
    end

    # Sets `parent` on the descendants of this node and reports malformed
    # nodes on stderr.
    #
    # NOTE(review): a binary node whose label contains '_' is skipped
    # together with everything below it; the original code had an unfinished
    # stub there ("determine which binary branch has the head").  Kept as-is.
    def set_parents
      if @children.empty?
        $stderr.print "ERROR: EXPECT (PRETERM), POS, WORD NODES IN RCTREE!!! curr:#{@head}\nLine=#{$line}\n"
        return
      end
      only = @children[0]
      if @children.size == 1 && only.children.size == 1 && !only.children[0].children.empty?
        $stderr.print "ERROR: EXPECT (PRETERM), POS, WORD NODES IN RCTREE!!! curr:#{@head} child:#{only.head} granch:#{only.children[0].head}\n"
        return
      end

      case @children.size
      when 1
        ## terminal case: nothing below a preterminal needs a parent link
        return if only.children.empty?
        ## unary case
        only.set_parents
        only.parent = self
      when 2
        ## binary case
        return if @head.include?('_')
        @children.each do |child|
          child.set_parents
          child.parent = self
        end
      else
        # This check used to be nested inside the binary branch, where it
        # could never fire; n-ary nodes are now actually reported.
        $stderr.print "ERROR: calcHdwdTree.rb requires binary trees. Node #{@head} has #{@children.size} children.\n"
      end
    end

    private

    # True when any '_'-separated part of +child+'s label has bare category +cat+.
    def child_matches?(child, cat)
      child.head.split('_').any? { |part| getBareCategory(part) == cat }
    end

    # Marks child +head_idx+ (0 or 1) as head and its sibling as modifier;
    # returns the head child's headword.
    def promote_child(head_idx)
      @children[head_idx].setHead
      @children[1 - head_idx].setModifier
      @children[head_idx].hdwd
    end

    # This node's headword after the "unk" filter: --cut (when non-zero) keeps
    # headwords whose count exceeds the threshold; otherwise --best (when not
    # -1) keeps headwords ranked below the limit; otherwise unfiltered.
    def unk_headword
      if $options[:cut] != 0
        $hdwdfreqs[@hdwd] > $options[:cut] ? @hdwd : "unk"
      elsif $options[:best] != -1
        $hdwdranks[@hdwd] < $options[:best] ? @hdwd : "unk"
      else
        @hdwd
      end
    end

  end
  
  ##########################################
  
  # Parses one tree line and annotates it with headwords: builds the Tree,
  # hangs it under a dummy root whose headword is '-', links parents and runs
  # the percolation.  Returns the annotated tree.
  # ($ROOTOFTREE is not defined in this file -- presumably it comes from
  # scripts/umnlp.rb; confirm.)
  def build_headword_tree(line)
    $line = line   # set_parents/percolate quote $line in their error messages
    tree = Tree.new($line)
    tree.parent = Tree.new()
    tree.parent.head = $ROOTOFTREE
    tree.parent.hdwd = '-'
    tree.set_parents
    tree.percolate()
    tree
  end

  # With --cut or --best the output depends on corpus-wide counts, so the
  # input has to be buffered and printed in a second pass.
  batch_mode = $options[:cut] != 0 || $options[:best] != -1
  lines = []

  ## first pass: annotate every tree and accumulate $hdwdfreqs
  while ($line = STDIN.gets)

    t = build_headword_tree($line)

    if batch_mode
      #batch processing (if need to count freq for cutoffs)
      lines.push($line)
    else
      #stream processing (normal)
      print t.hdwd_to_s + "\n"
    end

    $ctr = $ctr+1
    if $ctr % 1000 == 0
      $stderr.print " ... found headwords for #{$ctr} trees ...\n"
    end
  end

  ## second pass, for making things "unk"
  if batch_mode
    # Ranks are only consulted by --best.  The old guard `if $options[:best]`
    # was always true because -1 is truthy in Ruby.
    if $options[:best] != -1
      sortedhdwdfreqs = $hdwdfreqs.sort { |a, b| b[1] <=> a[1] }
      sortedhdwdfreqs.each_index { |i|
        $hdwdranks[sortedhdwdfreqs[i][0]] = i
      }
    end

    $stderr.print " ... finished first pass, making second for UNKs...\n"

    # Force "eos" to pass both filters.
    # NOTE(review): the `!= 0` guard is kept from the original; its intent is
    # not evident from this file.
    if $options[:best] != 0
      $hdwdfreqs["eos"] = $options[:cut]+1
      $hdwdranks["eos"] = $options[:best]-1
    end

    # Re-annotating a tree runs percolate again, which increments $hdwdfreqs.
    # Previously those increments went into the very counts that to_unk_s
    # compares against --cut, so the effective threshold drifted upwards as
    # the second pass progressed.  Count into a scratch hash instead and keep
    # the first-pass totals fixed.
    corpus_freqs = $hdwdfreqs

    $ctr = 0
    lines.each { |line|
      $hdwdfreqs = Hash.new(0)
      t = build_headword_tree(line)
      $hdwdfreqs = corpus_freqs

      print t.to_unk_s + "\n"

      $ctr = $ctr+1
      if $ctr % 1000 == 0
        $stderr.print " ... found headwords for #{$ctr} trees ...\n"
      end
    }
  end
