###############################################################################
##                                                                           ##
## This file is part of ModelBlocks. Copyright 2009, ModelBlocks developers. ##
##                                                                           ##
##    ModelBlocks is free software: you can redistribute it and/or modify    ##
##    it under the terms of the GNU General Public License as published by   ##
##    the Free Software Foundation, either version 3 of the License, or      ##
##    (at your option) any later version.                                    ##
##                                                                           ##
##    ModelBlocks is distributed in the hope that it will be useful,         ##
##    but WITHOUT ANY WARRANTY; without even the implied warranty of         ##
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          ##
##    GNU General Public License for more details.                           ##
##                                                                           ##
##    You should have received a copy of the GNU General Public License      ##
##    along with ModelBlocks.  If not, see <http://www.gnu.org/licenses/>.   ##
##                                                                           ##
###############################################################################

#!/usr/bin/ruby

#####################################################################
# trees2dat-gf
# This script implements a tree to model version of the HHMM parser
# (rc trees mapped to operations in the HHMM).  See Schuler 2009 (NAACL)
# for model architecture.
#
# Author: Tim Miller
# Author: William Schuler
#
######################################################################

require "scripts/umnlp.rb"

# Error flag; in this file only commented-out code inside Tree#toDat sets it.
$error = false
# Most recently read input line (assigned by the main read loop); it is
# interpolated into the first error message printed by Tree#toDat.
$line = ""

class Tree
  # Prints (to stdout) the model records for the subtree rooted at this node
  # and recurses into its children.  Record kinds written here are
  # "Ge", "Gtp", "Gtq", "Gtm", "Pg", "Fr" and "A".
  #
  # depth   - depth value written into every record; the left-child terminal
  #           case only emits records while depth is below 5.
  # qParent - label of the governing node; only the part after its last "/"
  #           is used (the whole label when it contains no "/").
  # numSibs - when 0, the "binary case becoming complete" branch skips its
  #           Fr/A records (the unary case passes 0 when it recurses).
  #
  # Reads the globals $argArray, $argIdx and $rel (advancing $argIdx after each
  # Fr/A pair) and $line / $lineNum for diagnostics.
  # NOTE(review): $argArray, $argIdx and $rel are not assigned anywhere in the
  # visible part of this file -- the caller must set them up first.
  #
  # Always returns nil; malformed trees are reported on stderr.
  def toDat(depth, qParent, numSibs=1)

    if @children.size==0
      $stderr.print "ERROR: EXPECT (PRETERM), POS, WORD NODES IN RCTREE!!! curr:"+@head + "\nLine=#{$line}\n"
      return
    end
    if @children.size==1 && @children[0].children.size==1 && @children[0].children[0].children.size!=0
      $stderr.print "ERROR: EXPECT (PRETERM), POS, WORD NODES IN RCTREE!!! curr:"+@head + " child:"+@children[0].head + " granch:"+@children[0].children[0].head + "\n"
      return
    end

    # Part of qParent after the last "/".
    gParentAwa = qParent.gsub(/.*\/(.*)/,'\1')

    ## terminal case, left child post-transform
    if @children.size==1 && @children[0].children.size==1 && @children[0].children[0].children.size==0
      pos = @children[0].children[0].head.gsub(/([^\#]*)\#.*/,'\1')
      if depth<5
        arg = getArg(@children[0].children[0].head)
        if arg.nil?
          # No label on this word: fall back to the left-most leaf under the
          # parent's second child.
          arg = getArg(getLeftMostChildOfRightSibling(@parent).head)
          if arg.nil?
            $stderr.print "#{$lineNum} Missing relations or arguments again #{getLeftMostChildOfRightSibling(@parent).head} (case 1). Use NULL for now\n"
            print "Ge " + depth.to_s + " " + gParentAwa + " " + "NULL" + " : " + @children[0].head + "\n"
          else
            print "Ge " + depth.to_s + " " + gParentAwa + " " + arg + " : " + @children[0].head + "\n"
          end
        else
          print "Ge " + depth.to_s + " " + gParentAwa + " " + arg + " : " + @children[0].head + "\n"
        end
        leftMostChildOfRightSibling = getLeftMostChildOfRightSibling(@parent)
        argForGtpGtq = getArg(leftMostChildOfRightSibling.head)
        if argForGtpGtq.nil?
          # Fall back to the label found above (which may itself be missing).
          if arg.nil?
            $stderr.print "#{$lineNum} Missing relations or arguments again: #{@children[0].children[0].head} (case 1.1). Use NULL for now\n"
            print "Gtp " + depth.to_s + " " + @children[0].head + " " + gParentAwa + " " + "NULL" + " : " + @head.gsub(/(.*)\/.*/,'\1') + "\n"
            print "Gtq " + depth.to_s + " " + @head.gsub(/(.*)\/.*/,'\1') + " " + @children[0].head + " " + "NULL" + " : " + @head.gsub(/.*\/(.*)/,'\1') + "\n"
          else
            print "Gtp " + depth.to_s + " " + @children[0].head + " " + gParentAwa + " " + arg + " : " + @head.gsub(/(.*)\/.*/,'\1') + "\n"
            print "Gtq " + depth.to_s + " " + @head.gsub(/(.*)\/.*/,'\1') + " " + @children[0].head + " " + arg + " : " + @head.gsub(/.*\/(.*)/,'\1') + "\n"
          end
        else
          print "Gtp " + depth.to_s + " " + @children[0].head + " " + gParentAwa + " " + argForGtpGtq + " : " + @head.gsub(/(.*)\/.*/,'\1') + "\n"
          print "Gtq " + depth.to_s + " " + @head.gsub(/(.*)\/.*/,'\1') + " " + @children[0].head + " " + argForGtpGtq + " : " + @head.gsub(/.*\/(.*)/,'\1') + "\n"
        end
        print "Pg " + @children[0].head + " : " + pos + "\n"
        print "Fr " + depth.to_s + " " + @children[0].head + " " + gParentAwa + " " + $argArray[$argIdx] + " : 1\n"
        print "A " + depth.to_s + " " + $rel + " " + $argArray[$argIdx] + " " + @children[0].head + " : " + $argArray[$argIdx+1] + "\n"
        $argIdx = $argIdx+1
      end
      ## if beyond max depth...
      if depth>=5
        $stderr.print "ERROR: TREE TOO BIG!!!\n"
      end
      return

    ## terminal case, right child post-transform
    elsif @children.size==1 && @children[0].children.size==0
      ## make sure head label equal to pos...
      pos = @children[0].head.gsub(/([^\#]*)\#.*/,'\1')
      arg = getArg(@children[0].head)
      if arg.nil?
        # No label on this word: try the right-most leaf of the left sibling.
        if @parent.children[1] == self   #this is the right child
          arg = getArg(getRightMostChild(@parent.children[0]).head)
          if arg.nil?
            $stderr.print "Missing relations or arguments again: #{getRightMostChild(@parent.children[0]).head} (case Ge 1). Two consecutive traces. Use NULL for now\n"
            print "Ge " + depth.to_s + " " + gParentAwa + " " + "NULL" + " : " + @head + "\n"
          else
            print "Ge " + depth.to_s + " " + gParentAwa + " " + arg + " : " + @head + "\n"
          end
        else
          $stderr.print "Missing relations or arguments again: Not the right-child (case Ge 1). Give up for now\n"
        end
      else
        print "Ge " + depth.to_s + " " + gParentAwa + " " + arg + " : " + @head + "\n"
      end
      print "Pg " + @head + " : " + pos + "\n"
      if @parent == nil || @parent.head =~ /\//
        print "Fr " + depth.to_s + " " + @head + " " + gParentAwa + " " + $argArray[$argIdx] + " : " + @head + "\n"
        print "A " + depth.to_s + " " + $rel + " " + $argArray[$argIdx] + " " + @head + " : " + $argArray[$argIdx+1] + "\n"
        $argIdx = $argIdx+1
      end
      return

    ## unary case
    elsif @children.size==1
      ## recurse to unary child...
      @children[0].toDat(depth,qParent,0)
      print "Fr " + depth.to_s + " " + @children[0].children[0].head.gsub(/(.*)\/.*/,'\1') + " " + gParentAwa + " " + $argArray[$argIdx] + " : 1\n"
      print "A " + depth.to_s + " " + $rel + " " + $argArray[$argIdx] + " " + @children[0].children[0].head.gsub(/(.*)\/.*/,'\1') + " : " + $argArray[$argIdx+1] + "\n"
      $argIdx = $argIdx+1

      leftMostChildOfRightSibling = getLeftMostChildOfRightSibling(@parent)
      arg = getArg(leftMostChildOfRightSibling.head)
      if arg.nil?
        # Fall back to the right-most leaf below the unary child.
        arg = getArg(getRightMostChild(@children[0]).head)
        if arg.nil?
          $stderr.print "#{$lineNum} Missing relations or arguments again: #{getRightMostChild(@children[0]).head} (case 3). Two consecutive traces. Use NULL for now.\n"
          print "Gtp " + depth.to_s + " " + @children[0].children[0].head.gsub(/(.*)\/.*/,'\1') + " " + gParentAwa + " " + "NULL" + " : " + @head.gsub(/(.*)\/.*/,'\1') + "\n"
          print "Gtq " + depth.to_s + " " + @head.gsub(/(.*)\/.*/,'\1') + " " + @children[0].children[0].head.gsub(/(.*)\/.*/,'\1') + " " + "NULL" + " : " + @head.gsub(/.*\/(.*)/,'\1') + "\n"
        else
          print "Gtp " + depth.to_s + " " + @children[0].children[0].head.gsub(/(.*)\/.*/,'\1') + " " + gParentAwa + " " + arg + " : " + @head.gsub(/(.*)\/.*/,'\1') + "\n"
          print "Gtq " + depth.to_s + " " + @head.gsub(/(.*)\/.*/,'\1') + " " + @children[0].children[0].head.gsub(/(.*)\/.*/,'\1') + " " + arg + " : " + @head.gsub(/.*\/(.*)/,'\1') + "\n"
        end
      else
        print "Gtp " + depth.to_s + " " + @children[0].children[0].head.gsub(/(.*)\/.*/,'\1') + " " + gParentAwa + " " + arg + " : " + @head.gsub(/(.*)\/.*/,'\1') + "\n"
        print "Gtq " + depth.to_s + " " + @head.gsub(/(.*)\/.*/,'\1') + " " + @children[0].children[0].head.gsub(/(.*)\/.*/,'\1') + " " + arg + " : " + @head.gsub(/.*\/(.*)/,'\1') + "\n"
      end
      return

    ## binary case remaining incomplete
    elsif @children.size==2 && @head=~/\//
      ## recurse to left child...
      @children[0].toDat(depth,qParent)
      ## recurse to right child...
      @children[1].toDat(depth+1,@children[0].head)
      leftMostChildOfRightSibling = getLeftMostChildOfRightSibling(@parent)
      arg = getArg(leftMostChildOfRightSibling.head)
      if arg.nil?
        # Fall back to the right-most leaf of this subtree.
        rightMostChild = getRightMostChild(self)
        arg = getArg(rightMostChild.head)
        if arg.nil?
          # No Gtm record is written in this case.
          $stderr.print "Missing relations or arguments again: #{rightMostChild.head} (case Gtm). 2 consecutive traces???\n"
        else
          print "Gtm " + depth.to_s + " " + @children[1].head + " " + @children[0].head.gsub(/.*\/(.*)/,'\1') + " " + arg + " : " + @head.gsub(/.*\/(.*)/,'\1') + "\n"
        end
      else
        print "Gtm " + depth.to_s + " " + @children[1].head + " " + @children[0].head.gsub(/.*\/(.*)/,'\1') + " " + arg + " : " + @head.gsub(/.*\/(.*)/,'\1') + "\n"
      end
      return

    ## binary case becoming complete
    elsif @children.size==2 && @head!~/\//
      ## recurse to left child...
      @children[0].toDat(depth,qParent)
      ## recurse to right child...
      @children[1].toDat(depth+1,@children[0].head)
      if numSibs > 0
        print "Fr " + depth.to_s + " " + @children[0].head.gsub(/(.*)\/.*/,'\1') + " " + gParentAwa + " " + $argArray[$argIdx] + " : " + @head + "\n"
        print "A " + depth.to_s + " " + $rel + " " + $argArray[$argIdx] + " " + @head + " : " + $argArray[$argIdx+1] + "\n"
        $argIdx = $argIdx+1
      end
      return
    end

    $stderr.print "ERROR: N-ARY BRANCH IN RCTREE!!!\n"
    return

  end

  private

  # Given the parent p of a node, returns the left-most leaf reached by
  # starting at the first child of p's second child and repeatedly following
  # first children.
  def getLeftMostChildOfRightSibling(p)
    leftMostChildOfRightSibling = p.children[1].children[0]
    while leftMostChildOfRightSibling.children.size > 0
      leftMostChildOfRightSibling = leftMostChildOfRightSibling.children[0]
    end
    return leftMostChildOfRightSibling
  end

  # Returns the right-most leaf of the tree rooted at t: follows the second
  # child at nodes with exactly two children and the first child otherwise.
  def getRightMostChild(t)
    rightMostChild = t
    while rightMostChild.children.size > 0
      if rightMostChild.children.size == 2
        rightMostChild = rightMostChild.children[1]
      else
        rightMostChild = rightMostChild.children[0]
      end
    end
    return rightMostChild
  end

  # Returns a zero- or one-element array holding the text after the first ":"
  # of term.  When that text contains "REL-<frameset>-", its last six
  # characters (the "-" plus five verb features, e.g. "-P--3A") are removed.
  def getArgWhenThereIsVerbFeatures(term)
    args = []
    pieces = term.split(/\:/, 2)
    if pieces.size > 1
      arg = pieces[1]
      if arg.match(/REL\-[^\-]*\-/)        #it's a REL, not arg
        arg.slice!(-6, 6)
      end
      args[0] = arg
    end
    return args
  end

  # Returns the label after the last ":" of term, or nil when term has no ":".
  # Labels starting with "REL-" collapse to "REL".  Otherwise, when
  # $MAIN_ARG_ONLY is set, numbered arguments lose their suffix
  # (ARG0-TO -> ARG0, ARGM-TMP unchanged); else when $NO_EXTENSION is set,
  # every ARGx-... loses its suffix (ARGM-TMP -> ARGM).
  def getArg(term)
    arg = nil
    wAnda = term.split(":")
    if wAnda.size > 1
      arg = wAnda[wAnda.length - 1]
      if arg =~ /^REL\-.*/
        arg = "REL"
      elsif ($MAIN_ARG_ONLY)
        arg = arg.sub( /ARG(\d)\-.*$/, 'ARG\1' )
      elsif ($NO_EXTENSION)
        arg = arg.sub( /ARG([^\-])\-.*$/, 'ARG\1' )
      end
    end
    return arg
  end

  # Splits the five verb features that follow "REL-<frameset>-" into one label
  # per feature that is set, keeping "-" placeholders in the other positions,
  # e.g. "REL-carry.03-P--3A" -> "REL-carry.03-P----", "REL-carry.03----3-",
  # "REL-carry.03-----A".  Text after the first ":" that carries no REL
  # (or no feature suffix) is returned unchanged as the only element; a term
  # without ":" yields [].
  # Not used for now.
  def getArgsWithFeaturesSplited(term)
    args = []
    pieces = term.split(/\:/, 2)
    return args unless pieces.size > 1

    all5Args = pieces[1]
    unless all5Args.match(/REL\-/)
      args[0] = all5Args
      return args
    end

    relAndFrameset = all5Args.slice!(/REL\-[^\-]*\-/)
    if relAndFrameset.nil?
      # REL label without a trailing feature block: nothing to split.
      args[0] = all5Args
      return args
    end

    (0..4).each do |idx|
      # Compare one-character substrings rather than character codes so the
      # check behaves the same whether String#[] yields an Integer (Ruby 1.8)
      # or a String (Ruby 1.9 and later).
      feature = all5Args[idx, 1]
      next if feature.nil? || feature.empty? || feature == "-"
      args.push(relAndFrameset + ("-" * idx) + feature + ("-" * (4 - idx)))
    end
    return args
  end
end

  
# Extracts the predicate annotation from a line.
#
# For a line containing "#carry:REL-carry.03" the regexp captures
#   1 : carry      (text between "#" and ":REL-", no spaces allowed)
#   2 : carry.03   (text after "REL-" up to the next ")")
# and the result is [capture 2, capture 1], i.e. ["carry.03", "carry"].
# When the line has no such annotation, a notice plus the line is written to
# stderr and nil is returned.
def getRel(line)
  found = line.match(/#([^ ]*):REL-([^)]*)/)
  unless found
    $stderr.puts "No rel ", line, "\n"
    return nil
  end
  before_rel = found[1]
  after_rel = found[2]
  [after_rel, before_rel]
end

# Writes one line per entry of $clusters to stdout.  Each entry is indexed as
# [argument sequence, counts], where counts yields [[x, y], count] pairs.
# Line layout: the arguments (each followed by a space), then
# "(<number of pairs>,<sum of counts>): ", then "[x y]=count " for every pair.
def printClusters
  $clusters.each do |cluster|
    arg_sequence = cluster[0]
    counts = cluster[1]

    arg_sequence.each { |arg| print arg, " " }

    total = 0
    rendered = counts.map do |rel_conj|
      text = "[" + rel_conj[0][0] + " " + rel_conj[0][1] + "]=" + rel_conj[1].to_s + " "
      total += rel_conj[1]
      text
    end

    print "(", counts.length, ",", total, "): "
    print rendered.join("")
    print "\n"
  end
end


# Records one input line in $clusters.
#
# The key is the line's argument labels (from getArgArray) with "NULL" entries
# dropped and runs of the same label collapsed to one; a "NULL" between two
# equal labels does not break the run.  Under that key a counter hash
# (default 0) is kept, and the counter for this line's getRel result is
# incremented.  Lines for which getRel returns nil are ignored.
def cluster(line)
  rel_conj = getRel(line)
  return if rel_conj.nil?

  cluster_key = []
  previous = ""
  getArgArray(line).each do |label|
    next if label == previous || label == "NULL"
    cluster_key.push(label)
    previous = label
  end

  $clusters[cluster_key] = Hash.new(0) if $clusters[cluster_key].nil?
  $clusters[cluster_key][rel_conj] += 1
end

# Returns every argument label found in line, in order of appearance.
# The pattern depends on the option globals ($MAIN_ARG_ONLY wins if both are set):
#   $MAIN_ARG_ONLY - ARG0..ARG5/ARGA without suffix, ARGM with its suffix
#   $NO_EXTENSION  - ARG0..ARG5/ARGA and ARGM, all without suffix
#   neither        - "ARG" plus everything up to the next ")"
# "REL" and "NULL" are matched in every mode.
def getArgArray(line)
  pattern =
    if $MAIN_ARG_ONLY
      /ARG[0-5A]|ARGM[^\)]*|REL|NULL/
    elsif $NO_EXTENSION
      /ARG[0-5A]|ARGM|REL|NULL/
    else
      /ARG[^)]*|REL|NULL/
    end
  line.scan(pattern)
end

# Option globals read by getArg / getArgArray, and the cluster table filled by
# cluster().
$MAIN_ARG_ONLY = false
$NO_EXTENSION = false
$clusters = Hash.new

# Removes the option flags "-m" and "-ne" from argv (in place) and returns
# [main_arg_only, no_extension] telling which of them were present.
# Every other argument is left untouched, so any remaining entries can still
# be consumed as input files by Kernel#gets.
def extractOptionFlags(argv)
  # Array#delete returns nil when the value was not in the array.
  main_arg_only = !argv.delete("-m").nil?
  no_extension = !argv.delete("-ne").nil?
  [main_arg_only, no_extension]
end

# The flags are deleted by value instead of popping inside ARGV.each: pop
# removed whatever argument came last and shortened the array mid-iteration,
# so "-m -ne" never saw "-ne" and left "-m" behind as a bogus file name.
$MAIN_ARG_ONLY, $NO_EXTENSION = extractOptionFlags(ARGV)

#hack to run in ide
#f = File.open("test") or die "Unable to open file..."

# Main driver: read the input line by line (Kernel#gets), feed every
# non-skipped line to cluster(), then print the clusters.
# $line and $lineNum stay global because Tree#toDat's diagnostics read them.
$lineNum = 0
until ($line = gets).nil?
  $lineNum += 1

  # Lines with -empty#0, -empty-ctrl#0, -empty-tmp#0 or -extrNP# are ignored
  # (85 sentences total, per the original author's note).
  has_empty_trace = $line =~ /\-empty(\-(ctrl|tmp))?#0/
  has_extracted_np = $line =~ /\-extrNP#/

  cluster($line) unless has_empty_trace || has_extracted_np
end

# Order clusters by how many distinct rels they contain (largest first),
# not necessarily by their total count.
$clusters = $clusters.sort { |left, right| right[1].length <=> left[1].length }

printClusters
