#!/usr/bin/perl -w

# createCoordFeatures.new.pl

# Shane Bergsma
# December 1, 2010

# Create features for the coordination examples that you yourself
# extracted from the PTB.

# Switches selecting which feature groups are generated (1 = on, 0 = off).
my $lexFeatsOn = 1;
my $contextFeatsOn = 1;
my $ngramFeatsOn = 1;
my $newNgramFeatures = 1;
# The "new" n-gram features are produced inside the n-gram feature section,
# so they cannot be enabled on their own.
die "Can't have new ones on without olds ones" unless ($ngramFeatsOn >= $newNgramFeatures);

# Optional single argument: the file holding the n-gram counts.
my $ngramCountFN = $ARGV[0];

# n-gram => count table, and the flag that switches the script from
# "print feature vectors" to "print the n-grams that need to be looked up".
my %counts;
my $printLookupMode = 0;
if ($ngramFeatsOn && defined($ngramCountFN)) {
  loadCounts($ngramCountFN, \%counts);
} elsif ($ngramFeatsOn) {
  # Without a count file there is nothing to compute features from.
  print STDERR "No count file passed, printing n-grams to look-up instead.\n";
  $printLookupMode = 1;
}

# Main loop: read one tab-separated example per line from STDIN,
#   label <TAB> NP string <TAB> context features [<TAB> further fields]
# where the NP string has the shape
#   "<pre pairs> and/CC <post pairs> <head pair>"
# and every pair is word/TAG.  In normal mode one feature vector is printed
# per example; in lookup mode ($printLookupMode) nothing is printed here and
# getCountFeature prints the n-grams whose counts are needed instead.
while (<STDIN>) {
  chomp;

  my ($label, $NPstring, $contextFeats) = split(/\t/, $_);
  die "bad line" unless (defined($NPstring) && $NPstring =~ /^(.*) and\/CC (.*) ([^ ][^ ]*)$/);
  # Copy the captures right away so that no later match can disturb them.
  my ($preChunk, $postChunk, $headPair) = ($1, $2, $3);

  # The last pair of each conjunct is the conjoined word itself; any pairs
  # in front of it are its modifiers, joined with "-" in their original order.
  my ($preWord, $preTag, $preOtherWrds, $preOtherTgs) = splitConjunct($preChunk);
  my ($postWord, $postTag, $postOtherWrds, $postOtherTgs) = splitConjunct($postChunk);
  my ($headWord, $headTag) = splitWordTag($headPair);

  my $fv = "$label";

  if ($contextFeatsOn && !$printLookupMode) {
    # To do: divide this by tag and word?
    # Try subbing in the preWord alone
    $fv .= " $contextFeats";
  }

  if ($lexFeatsOn && !$printLookupMode) {
    my $fvString = "";
    $fvString .= " PREModsW=$preOtherWrds" if ($preOtherWrds ne "");
    $fvString .= " PREModsT=$preOtherTgs" if ($preOtherTgs ne "");
    $fvString .= " POSTModsW=$postOtherWrds" if ($postOtherWrds ne "");
    $fvString .= " POSTModsT=$postOtherTgs" if ($postOtherTgs ne "");

    $fvString .= " PreW=$preWord";
    $fvString .= " PostW=$postWord";
    $fvString .= " HeadW=$headWord";

    $fvString .= " PreT=$preTag";
    $fvString .= " PostT=$postTag";
    $fvString .= " PrePostT=$preTag^$postTag";
    $fvString .= " HeadT=$headTag";
    $fvString .= " PrePostHeadT=$preTag^$postTag^$headTag";
    $fvString .= " PreT=PostT" if ($preTag eq $postTag);
    $fvString .= " PreW=PostW" if ($preWord eq $postWord);

    # Record the pre and post, regardless of order:
    my $prePost = join "_", sort ($preWord, $postWord);
    $fvString .= " PrePostW=$prePost";

    # ':' and '#' are replaced inside the lexical features (presumably
    # because they are special in the downstream feature-file format --
    # TODO confirm).
    $fvString =~ tr/:/;/;
    $fvString =~ tr/\#/|/;
    $fv .= "$fvString";
  }

  ########################################
  #
  # N-GRAM FEATURES
  #
  ########################################

  # In the templates below '^' separates tokens (getCountFeature turns it
  # into a space) and '#1' / '#2' are slots filled from the filler lists.
  if ($ngramFeatsOn) {
    # This we use again and again:
    my $determinerRefList = ["the", "a", "an", "its", "his"];
    my $coordRefList = ["and", "or", ","];

    # Individuals:
    $fv .= getCountFeature(\%counts, "pr", "$preWord");
    $fv .= getCountFeature(\%counts, "po", "$postWord");
    $fv .= getCountFeature(\%counts, "h", "$headWord");

    # Now, here are some useful pairs:
    $fv .= getCountFeature(\%counts, "pr^h", "$preWord^$headWord");
    $fv .= getCountFeature(\%counts, "po^h", "$postWord^$headWord");

    # Conjunctions:
    $fv .= getOneTemplateCountFeature(\%counts, "pr^CC^po", "$preWord^#1^$postWord", $coordRefList);
    $fv .= getOneTemplateCountFeature(\%counts, "po^CC^pr", "$postWord^#1^$preWord", $coordRefList);

    # Knowing how common the whole thing is a very useful feature:
    # NOTE(review): the second lookup embeds the post modifiers joined with
    # "-" (and leaves an empty slot when there are none) -- confirm that
    # this is the n-gram form the count file is keyed on.
    $fv .= getCountFeature(\%counts, "all", "$preWord^and^$postWord^$headWord", "$preWord^and^$postOtherWrds^$postWord^$headWord");

    ##### Nakov&Hearst Patterns:
    # Mod Reversed:
    $fv .= getOneTemplateCountFeature(\%counts, "po^CC^pr^h", "$postWord^#1^$preWord^$headWord", $coordRefList);
    # 2-unit reversed:
    $fv .= getOneTemplateCountFeature(\%counts, "po^h^CC^pr", "$postWord^$headWord^#1^$preWord", $coordRefList);
    # Explicit:
    $fv .= getOneTemplateCountFeature(\%counts, "pr^h^CC^po^h", "$preWord^$headWord^#1^$postWord^$headWord", $coordRefList);
    # Explicit reversed:
    $fv .= getOneTemplateCountFeature(\%counts, "po^h^CC^pr^h", "$postWord^$headWord^#1^$preWord^$headWord", $coordRefList);

    # To do:
    # Rus's interesting idea: try po^p1 vs po^h --> policeman and park guard --> park policeman should occur too.
    # Your interesting idea: try the context with n1 alone vs. n2 h alone

    if ($newNgramFeatures) {
      my $andOrRefList = ["and", "or"];

      # NEW: Conjunctions with commas -- a good sign!
      $fv .= getOneTemplateCountFeature(\%counts, "pr,CC^po", "$preWord^,^#1^$postWord", $andOrRefList);
      $fv .= getOneTemplateCountFeature(\%counts, "po,CC^pr", "$postWord^,^#1^$preWord", $andOrRefList);
      $fv .= getOneTemplateCountFeature(\%counts, "pr^CC^po,", "$preWord^#1^$postWord^,", $andOrRefList);
      $fv .= getOneTemplateCountFeature(\%counts, "po^CC^pr,", "$postWord^#1^$preWord^,", $andOrRefList);

      # NEW: Conjunctions with determiners:
      $fv .= getTwoTemplateCountFeature(\%counts, "DT^pr^CC^po", "#2^$preWord^#1^$postWord", $coordRefList, $determinerRefList);
      $fv .= getTwoTemplateCountFeature(\%counts, "DT^po^CC^pr", "#2^$postWord^#1^$preWord", $coordRefList, $determinerRefList);
      $fv .= getTwoTemplateCountFeature(\%counts, "pr^CC^DT^po", "$preWord^#1^#2^$postWord", $coordRefList, $determinerRefList);
      $fv .= getTwoTemplateCountFeature(\%counts, "po^CC^DT^pr", "$postWord^#1^#2^$preWord", $coordRefList, $determinerRefList);
      $fv .= getTwoTemplateCountFeature(\%counts, "DT^pr^and^DT^po", "#1^$preWord^and^#2^$postWord", $determinerRefList, $determinerRefList);
      $fv .= getTwoTemplateCountFeature(\%counts, "DT^po^and^DT^pr", "#1^$postWord^and^#2^$preWord", $determinerRefList, $determinerRefList);

      # NEW: Conjunction with the head -- you never did it!
      $fv .= getOneTemplateCountFeature(\%counts, "pr^CC^h", "$preWord^#1^$headWord", $coordRefList);
      $fv .= getOneTemplateCountFeature(\%counts, "h^CC^pr", "$headWord^#1^$preWord", $coordRefList);
      $fv .= getTwoTemplateCountFeature(\%counts, "DT^pr^CC^h", "#2^$preWord^#1^$headWord", $coordRefList, $determinerRefList);
      $fv .= getTwoTemplateCountFeature(\%counts, "DT^h^CC^pr", "#2^$headWord^#1^$preWord", $coordRefList, $determinerRefList);
      $fv .= getTwoTemplateCountFeature(\%counts, "pr^CC^DT^h", "$preWord^#1^#2^$headWord", $coordRefList, $determinerRefList);
      $fv .= getTwoTemplateCountFeature(\%counts, "h^CC^DT^pr", "$headWord^#1^#2^$preWord", $coordRefList, $determinerRefList);
      $fv .= getTwoTemplateCountFeature(\%counts, "DT^h^and^DT^pr", "#1^$headWord^and^#2^$preWord", $determinerRefList, $determinerRefList);
      $fv .= getTwoTemplateCountFeature(\%counts, "DT^h^and^DT^po", "#1^$headWord^and^#2^$postWord", $determinerRefList, $determinerRefList);

      # NEW: think about predictive contexts:
      foreach my $leftC ("with", "and", "as", "including", "on", "is", "are", "&") {
        $fv .= getCountFeature(\%counts, "$leftC+all", "$leftC^$preWord^and^$postWord^$headWord");
      }
      foreach my $rightC ("and", "have", "of", "on", "said", "to", "were", "&") {
        $fv .= getCountFeature(\%counts, "all+$rightC", "$preWord^and^$postWord^$headWord^$rightC");
      }

      # Head-first paraphrases with prepositions:
      my $prepRefList = ["of", "for", "in", "at", "on", "from", "with", "about"];
      $fv .= getOneTemplateCountFeature(\%counts, "h^PREP^pr", "$headWord^#1^$preWord", $prepRefList);
      $fv .= getOneTemplateCountFeature(\%counts, "h^PREP^po", "$headWord^#1^$postWord", $prepRefList);
      $fv .= getTwoTemplateCountFeature(\%counts, "h^PREP^pr^CC^po", "$headWord^#1^$preWord^#2^$postWord", $prepRefList, $coordRefList);
      #### $fv .= getTwoTemplateCountFeature(\%counts, "h^PREP^po^CC^pr", "$headWord^#1^$postWord^#2^$preWord", $prepRefList, $coordRefList);
      # Actually, don't want to reverse po/pr, because: this could be both: costs/NNS and/CC exploration/NN spending/NN
      # h^PREP^po^CC^pr = ... spending on exploration and costs ...
    }
  }

  print "$fv\n" unless $printLookupMode;
}

# Split one "word/TAG" pair at its LAST slash, so that a word which itself
# contains a slash keeps its full text and still gets the right tag.
# Returns (word, tag); the tag is "" when the pair has no slash, and both
# values are "" when the pair is undefined.
sub splitWordTag {
  my ($pair) = @_;
  return ("", "") unless defined($pair);
  my $cut = rindex($pair, "/");
  return ($pair, "") if ($cut < 0);
  return (substr($pair, 0, $cut), substr($pair, $cut + 1));
}

# Split a space-separated run of word/TAG pairs into the final pair and the
# modifiers in front of it.  Returns (word, tag, modifierWords, modifierTags),
# where the modifier strings are the modifiers' words / tags joined with "-"
# in their original order ("" when there are no modifiers).
sub splitConjunct {
  my ($chunk) = @_;
  my @pairs = split(/ /, $chunk);
  my ($word, $tag) = splitWordTag(pop(@pairs));
  my (@modWords, @modTags);
  foreach my $pair (@pairs) {
    my ($modWord, $modTag) = splitWordTag($pair);
    push(@modWords, $modWord);
    push(@modTags, $modTag);
  }
  return ($word, $tag, join("-", @modWords), join("-", @modTags));
}

# Load the n-gram counts from a file into a hash.
#   $_[0]  name of a tab-separated file with "ngram<TAB>count[<TAB>...]"
#          lines; any further fields are ignored.
#   $_[1]  reference to the hash that receives ngram => count.
# Lines consisting only of "NA", and lines without both an n-gram and a
# count field, are skipped.  Dies if the file cannot be opened.
sub loadCounts {
  my ($fileName, $countHashName) = @_;

  # Three-argument open: the name is always treated as a plain file to
  # read, even if it starts or ends with characters such as '>' or '|'.
  open(my $countsFH, '<', $fileName)
    or die "I couldn't get at counts: $fileName: $!";
  while (my $line = <$countsFH>) {
    chomp($line);
    next if ($line =~ /^NA$/);
    my ($ngram, $cnt) = split(/\t/, $line);
    # A line with no count cannot contribute to any feature.
    next unless (defined($ngram) && defined($cnt));
    $countHashName->{$ngram} = $cnt;
  }
  close($countsFH);
}

# Turn the summed count of one or more n-grams into a single feature string.
#   $countHashName  reference to the ngram => count hash
#   $label          feature name used in the output
#   @lookups        n-grams to sum over, with '^' standing for the spaces
#                   between tokens
# Normal mode: returns " CNT-<label>:<total>" when the total is positive and
# " CNT-<label>-UNDEF" otherwise.
# Lookup mode ($printLookupMode): prints every n-gram on its own line and
# returns the empty string.
sub getCountFeature {
  my ($countHashName, $label, @lookups) = @_;
  my $total = 0;
  for my $query (@lookups) {
    # '^' is only the in-script token separator; real n-grams use spaces.
    (my $ngram = $query) =~ tr/^/ /;
    if ($printLookupMode) {
      print "$ngram\n";
      next;
    }
    my $seen = $countHashName->{$ngram};
    $total += $seen if defined($seen);
  }
  return "" if $printLookupMode;
  return $total > 0 ? " CNT-$label:$total" : " CNT-$label-UNDEF";
}

# Rather than taking a ready-made list of lookups, take a template with one
# '#1' slot and a reference to an array of fillers: one lookup is built per
# filler by replacing the first '#1' in the template, and the resulting list
# is handed to getCountFeature together with the hash and the label.
sub getOneTemplateCountFeature {
  my ($countHashName, $label, $template, $fillerArray) = @_;
  my @lookups;
  foreach my $filler (@{$fillerArray}) {
    (my $filled = $template) =~ s/#1/$filler/;
    push(@lookups, $filled);
  }
  return getCountFeature($countHashName, $label, @lookups);
}

# Two-slot variant: the template holds a '#1' and a '#2' slot, filled from
# the first and the second filler array respectively.  One lookup is built
# for every (filler1, filler2) combination, the first array varying slowest,
# and the whole list is handed to getCountFeature.
sub getTwoTemplateCountFeature {
  my ($countHashName, $label, $template, $fillerArray1, $fillerArray2) = @_;
  my @lookups;
  foreach my $first (@{$fillerArray1}) {
    foreach my $second (@{$fillerArray2}) {
      my $filled = $template;
      $filled =~ s/#1/$first/;
      $filled =~ s/#2/$second/;
      push(@lookups, $filled);
    }
  }
  return getCountFeature($countHashName, $label, @lookups);
}
