###############################################################################
##                                                                           ##
## This file is part of ModelBlocks. Copyright 2009, ModelBlocks developers. ##
##                                                                           ##
##    ModelBlocks is free software: you can redistribute it and/or modify    ##
##    it under the terms of the GNU General Public License as published by   ##
##    the Free Software Foundation, either version 3 of the License, or      ##
##    (at your option) any later version.                                    ##
##                                                                           ##
##    ModelBlocks is distributed in the hope that it will be useful,         ##
##    but WITHOUT ANY WARRANTY; without even the implied warranty of         ##
##    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the          ##
##    GNU General Public License for more details.                           ##
##                                                                           ##
##    You should have received a copy of the GNU General Public License      ##
##    along with ModelBlocks.  If not, see <http://www.gnu.org/licenses/>.   ##
##                                                                           ##
###############################################################################

#!/usr/bin/perl
#########################
# rules2binaryrules.pl
#
# Takes in a set of rules in the style created by Lane's tree_to_rule.rb
# script, and binarizes them using Magerman head rules.
#########################

use strict;

sub getMainCat {
  my $cat = shift;
  my $ind = index($cat, "-");
  if( $ind <= 0 ){
    return $cat;
  }else{
    return substr($cat, 0, $ind);
  }
}

my $mhr = "/project/nlp/tmill/wordgen/model/head_rules.txt";

if($ARGV[0] ne ""){
  print stderr "Using file $ARGV[0] for head rules\n";
  $mhr = $ARGV[0];
}
my %head_start = ();
my %head_list = ();

die "Could not find model file\n" if not -e $mhr;

## reading in head rules table
open HEAD_TABLE, "< $mhr";
while(<HEAD_TABLE>){
  my @fields = split /\s+/, $_;
  my $nt = $fields[0];
  $head_start{$nt} = $fields[1];
  ## Now get the Non-terminal and the left/right specifier off the array...
  shift @fields;
  shift @fields;
  ## ... and add the list to the 
  $head_list{$nt} = \@fields;
}


## Now we'll look through all our sentences' rules.
my @rule_list = ();
my $rule;

while(1){
  ## Try to read from the rule list, then stdin, otherwise quit.
  if((not ($rule = shift @rule_list)) && (not ($rule = <STDIN>))){
    last;
  }
  #print "$rule";
  chomp($rule);
  if($rule =~ m/SENTENCE/){
    print "$rule\n";
    next;
  }
  my ($sent_num) = ($rule =~ m/Sentence (\d+)/);
  
  ## First look if its binary or unary branching (most cases)
  my $temp = $rule;
  $temp =~ s/\(\d+\)//g;
  my @fields = split /\s+/, $temp;

  ## Done variable allows us to exit out of 2 for-loops
  my $done = 0;
  
  if($#fields <= 5){
    print "$rule\n";
  }else{
    ## Get the root of the rule (fields 0 and 1 are the word "sentence" and the number)
    my $lhs = getMainCat($fields[2]);
    my ($full_lhs, $rhs) = ($rule =~ m/(.*=>) (.*)$/);
    ## Make sure this item is in the table
    if(!defined($head_list{$lhs})){
#      print stderr "Could not find $lhs in the table, printing out rule as is and skipping...\n";
      print "$rule\n";
      next;
    }
    ## Shift off the first 4 fields ("Sentence", number, double arrow, lhs)
    shift @fields; shift @fields; shift @fields; shift @fields;
    
    ## First, Magerman treats NP as a special case:
    if($lhs eq "NP"){
      ## Starting with the right-most child and working left...
      for (my $ind = $#fields; $ind >= 0; $ind--){ 
        ## Look for the first child with cat starting with the letter "N"
        if(index($fields[$ind], "N") == 0){
          ## Found a child with that description...
          $done = 1;
          if($ind == $#fields){
            my ($num_left, $cat_left, $num_mid, $num_right) = 
                    ($rhs =~ m/\((\d+)\) ([^ ]+) \((\d+)\) \Q$fields[-1]\E \((\d+)\)$/);
            print "Sentence $sent_num: ($num_left) $fields[-1] ($num_right) => ($num_left) $cat_left ($num_mid) $fields[-1] ($num_right)\n";
            my $new_rule = "$full_lhs " . $` . "($num_left) $fields[-1] ($num_right)"; #`
            unshift @rule_list, $new_rule;
            last;
          }else{
            my ($num_left, $num_mid, $cat_right, $num_right) = 
                    ($rhs =~ m/\((\d+)\) \Q$fields[$ind]\E \((\d+)\) ([^ ]+) \((\d+)\)/);
            print "Sentence $sent_num: ($num_left) $fields[$ind] ($num_right) => ($num_left) $fields[$ind] ($num_mid) $cat_right ($num_right)\n";
            my $new_rule = "$full_lhs " . $` . "($num_left) $fields[$ind] ($num_right)" . $'; #` 
            unshift @rule_list, $new_rule;
            last;
          }
        }
      }
      if($done == 1){
        next;
      }
    }
    ## Now do 
    my @list = $head_list{$lhs};
    foreach my $child (@{$head_list{$lhs}}){
      if($head_start{$lhs} eq "left"){
        # check the first n-1 children, if they are the right category, merge them
        # with the cat to their right.
        foreach my $ind (0..$#fields-1){
          if(getMainCat($fields[$ind]) eq $child || $child eq "**"){
            my $new_child = getMainCat($fields[$ind]);
            $done = 1;
            my ($num_left, $num_mid, $cat_right, $num_right) = 
                    ($rhs =~ m/\((\d+)\) \Q$fields[$ind]\E \((\d+)\) ([^ ]+) \((\d+)\)/);
            print "Sentence $sent_num: ($num_left) $new_child ($num_right) => ($num_left) $fields[$ind] ($num_mid) $cat_right ($num_right)\n";
            my $new_rule = "$full_lhs " . $` . "($num_left) $new_child ($num_right)" . $'; #This comment is for my editor's highlighting rules - the following backtick is important.` 
            unshift @rule_list, $new_rule;
            last;
          }
        }
        if($done == 1) {
          last;
        }
        
        # check the n_th child, and merge to the left if the same
        if(getMainCat($fields[-1]) eq $child){
          $done = 1;
          my ($num_left, $cat_left, $num_mid, $num_right) = 
                    ($rhs =~ m/\((\d+)\) ([^ ]+) \((\d+)\) \Q$fields[-1]\E \((\d+)\)$/);
          print "Sentence $sent_num: ($num_left) $child ($num_right) => ($num_left) $cat_left ($num_mid) $fields[-1] ($num_right)\n";
          my $new_rule = "$full_lhs " . $` . "($num_left) $child ($num_right)"; #`
          unshift @rule_list, $new_rule;
        }
        if($done == 1){
          last;
        }
      }elsif($head_start{$lhs} eq "right"){
        ## Check the n_th child, merge to the left if the same
        if(getMainCat($fields[-1]) eq $child || $child eq "**"){
          my $new_child = getMainCat($fields[-1]);
          my ($num_left, $cat_left, $num_mid, $num_right) = 
                    ($rhs =~ m/\((\d+)\) ([^ ]+) \((\d+)\) \Q$fields[-1]\E \((\d+)\)$/);
          print "Sentence $sent_num: ($num_left) $new_child ($num_right) => ($num_left) $cat_left ($num_mid) $fields[-1] ($num_right)\n";
          my $new_rule = "$full_lhs " . $` . "($num_left) $new_child ($num_right)"; #`
          unshift @rule_list, $new_rule;
          $done = 1;
        }
        if($done == 1){
          last;
        }
        ## Check the children going backwards
        for(my $ind = $#fields-1; $ind >= 0; $ind--){
          my $main_cat = getMainCat($fields[$ind]);
          if($child eq $main_cat){
            ## Here we have to be careful in case there is more than one match... normally
            ## perl would give us the first match (left to right), but this condition
            ## by definition we want the last match (left to right), or first from the 
            ## right.  Using the m//g regexp construction w/o any regexp memory gives a 
            ## list of matching strings, so we just re-run the regexp on the last string
            ## in the list to get our variables of interest.
            my @match_list = ($rhs =~ /\(\d+\) \Q$fields[$ind]\E \(\d+\) [^ ]+ \(\d+\)/g);
            my $old_lhs = $`; #editor `
            my $old_rhs = $'; 
            my $last_match = $match_list[-1];
            my ($num_left, $num_mid, $cat_right, $num_right) = 
                    ($last_match =~ m/\((\d+)\) \Q$fields[$ind]\E \((\d+)\) ([^ ]+) \((\d+)\)/);
            ## After this point its the same as above (could functionize)
            print "Sentence $sent_num: ($num_left) $child ($num_right) => ($num_left) $fields[$ind] ($num_mid) $cat_right ($num_right)\n";
            my $new_rule = "$full_lhs $old_lhs($num_left) $child ($num_right)$old_rhs"; 
            unshift @rule_list, $new_rule;           
            $done = 1;
            last;
          }
        }
        if($done == 1){
          last;
        }
      }
    }
   ## If we get here, we didn't find any way to shorten the rule...
#    print "done = $done\n";
    if($done == 0){
      print "$rule\n";
    }
   
  }
}

