#    This is a part of the "NED", a dependency parsing evaluation tool developed
#    in the Hebrew University by Roy Schwartz, Omri Abend, Roi Reichart and Ari Rappoport.
#    Copyright (C) 2011  Omri Abend, Roi Reichart and Ari Rappoport
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <http://www.gnu.org/licenses/>.
#
#    If you have any questions, feel free to contact us in:
#    roys02@cs.huji.ac.il, omria01@cs.huji.ac.il, roiri@csail.mit.edu
#
#    Please read the README file for more details.
#
#!/usr/local/bin/perl -w

use strict;
use ParseSentence;
use POS;
use Getopt::Long;
use IO::File;
use List::Util;


use constant INF => 100000;

# ROOT constant.
use constant ROOT => -1;

use constant X_DEFAULT_VAL => "0,2";
use constant Y_DEFAULT_VAL => "1,0";

use constant PUNCT_DFLT => "([\,\.\:\#\(\)]|\-[LR]RB\-|[\`\']{2}|LS|SYM||PUNCT)";


my ($maxLen, $xVal, $yVal, $punctRE, $if, $gold) =
		(INF, X_DEFAULT_VAL, Y_DEFAULT_VAL, PUNCT_DFLT);

my %convertWords = ('-LRB-' => '(', '-LCB-' => '(',
                                        '-RRB-' => ')', '-RCB-' => ')',
                                        '{'     => '(', '}' => ')');


my @xVals;
my @yVals;

sub main(@) {
	usage();

	@xVals = split(",", $xVal);
	@yVals = split(",", $yVal);

	my $ifh = new IO::File($if) or die "Can't open $if for reading";
	my $goldFH = new IO::File($gold) or die "Can't open $gold for reading";

	my $wordCounter = 1;
	my $lineNumber = 1;
	my $nSentences = 0;
	my $nNonPunct = 0;
	my @parents;
	my @goldPoses;
	my @goldParents;

	my $totalOrigDist = 0;
	my $totalParentDist = 0;
	my $totalSpecialDist = 0;
	my $totalTokens = 0;

	while (my $line = $ifh->getline()) {
		my $goldLine = $goldFH->getline();

		unless ($goldLine) {
			die "Inconsistent number of lines (test has line number $lineNumber while gold doesn't)";
		}

		chomp($goldLine);
		chomp($line);

		# Skip sentence.
		if ($goldLine =~ /^[ \t]*$/) {
			$nSentences++;
			if ($line !~ /^[ \t]*$/) {
				die "Gold line number $lineNumber is empty, while test line is not ($line)";
			}

			if ($nNonPunct <= $maxLen and $parents[0] !~ /\?\?/) {
				my $goldSentence = new ParseSentence(\@goldPoses, {CONTEND=>{}, DEP => {}}, $punctRE, 0);

				foreach my $i (0 .. @goldParents - 1) {
					$goldSentence->addPosParent($i+1, $goldParents[$i]);
				}

				treeBasedEval($goldSentence, \@parents,
							\$totalOrigDist, \$totalParentDist, \$totalSpecialDist, \$totalTokens, $nSentences);
			}

			$wordCounter = 1;
			@parents = @goldParents = @goldPoses = ();
			$nNonPunct = 0;
		} else {
			my ($goldWord, $goldPOS, $goldParent) = getParts($goldLine);
			my ($testWord, $testPOS, $testParent) = getParts($line);

			# Files are not synchronized.
			if (defined $goldWord and defined $testWord and lc($goldWord) ne lc($testWord)) {
				unless ($testPOS eq $goldPOS) {
					die "Inconsistency in line $lineNumber ($line, $goldLine): Gold word is '$goldWord' and test word is '$testWord'\n";
				}

			}

			if (defined $testPOS and ($testPOS eq "PUNCT" or $testPOS =~ /^$punctRE$/)) {
			} elsif (defined $goldPOS and $goldPOS =~ /^$punctRE$/) {
			} else {
				if (defined $goldPOS and defined $testWord and $goldPOS ne $testPOS) {
					die "Error in line $lineNumber: gold pos = $goldPOS, test pos = $testPOS\n";
				}
				$nNonPunct++;
			}

			unless (defined $goldParent) {
				die "Illegal gold line '$goldLine'";
			} elsif (not defined $testParent) {
				die "Illegal test line '$line'";
			}


			push(@goldPoses, $goldPOS);

			if ($goldParent == -1) {
				$goldParent = ROOT;
			}

			if ($testParent =~ /-1/) {
				$testParent = ROOT;
			}


			push(@parents, $testParent);
			push(@goldParents, $goldParent);

			$wordCounter++;
		}

		$lineNumber++;
	}

	$ifh->close();
	$goldFH->close();

	unless ($totalTokens) {
	  die "No sentences to evaluate.\n";
	}

	printf ("%d/%d (%.03f)\n", $totalTokens-$totalSpecialDist, $totalTokens, 1 - $totalSpecialDist/$totalTokens);

	return 0;
}

sub treeBasedEval($$$$$$) {
	my $goldSentence = shift;
	my $parents = shift;

	my $totalOrigDist = shift;
	my $totalParentDist = shift;
	my $totalSpecialDist = shift;
	my $totalTokens = shift;
	my $sentNum = shift;

	foreach my $i (0 .. @$parents - 1) {
		my $posObj = $goldSentence->posByIndex($i+1);

		if (defined ($posObj)) {
			$$totalTokens++;
			# Parents are not the same - error.
			if ($parents->[$i] != $posObj->parent()) {
				my ($origDist, $parentDist, $specialDist) = calcTreeDistance($goldSentence, $i+1, $parents->[$i], scalar(@$parents), $sentNum);
				$$totalOrigDist += $origDist;
				$$totalParentDist += ($parentDist > 0);
				$$totalSpecialDist += ($specialDist > 0);
			}
		}
	}
}


sub calcTreeDistance($$$$$) {
	my $goldSentence = shift;
	my $origIndex = shift;
	my $parentIndex = shift;
	my $sentLength = shift;
	my $sentNum = shift;

	my @origForFathers;

	my $forFather = $origIndex;
	my @parentsPoses;

	my $counter = $sentLength;

	while ($forFather != POS::ROOT) {
		unless ($counter--) {
		    die "Dependency loop in sentence number $sentNum - ".$goldSentence->toString()."\n";
		}
		
		push(@origForFathers, $forFather);
		my $data = $goldSentence->posByIndex($forFather)->data();

		if (defined $data) {
			$data =~ s/VB.?/V/;
			$data =~ s/NN([PS]|PS)?/N/;

			push(@parentsPoses, $data);
		}

		$forFather = $goldSentence->posByIndex($forFather)->parent();
	}

	push(@origForFathers, POS::ROOT);
	push(@parentsPoses, "ROOT");

	my $goldForFather = $parentIndex;

	my $nParentDist = 0;

	while (defined ($goldForFather) and $nParentDist < $sentLength) {
		foreach my $i (0 .. @origForFathers - 1) {
			if ($goldForFather == $origForFathers[$i]) {
				return ($i, $nParentDist, calcDistances($i, $nParentDist));
			}
		}

		$nParentDist++;

		$goldForFather = $goldSentence->posByIndex($goldForFather)->parent();
	}

	die "No common parent found ($origIndex, $parentIndex)";
}


sub calcDistances($$) {
	my $origDist = shift;
	my $parentDist = shift;

	foreach my $i (0 .. @xVals - 1) {
		return 0 if ($origDist == $xVals[$i] and $parentDist == $yVals[$i]);
	}

	return 1;
}


sub getParts($) {
	my $sentence = shift;

	my @elements = split("[ \t]+", $sentence);

	my ($word, $pos, $parent);

	if (@elements > 6) {
		($word, $pos, $parent) = ($elements[1], $elements[4], $elements[6]);
	} elsif (@elements == 3) {
		($word, $pos, $parent) = ($elements[0], $elements[1], $elements[2]);
	} elsif (@elements == 1) {
		($word, $pos, $parent) = (undef, undef, $elements[0]);
	} else {
		die "Illegal number of elements (".@elements.") - '@elements'";
	}

	unless ($parent) {
		$parent = ROOT;
	}

	if (defined $word and  exists $convertWords{$word}) {
		$word = $convertWords{$word};
	}

	return ($word, $pos, $parent);
}



sub usage() {
	my $usage;

	my $result = GetOptions ("if=s" => \$if,
			"gold=s"	=> \$gold,
			"ml=i"		=> \$maxLen,
			"h+"		=> \$usage
	);

    ### Sanity.
    if ($usage or not defined $if or not defined $gold) {
		die "A script which performs NED evaluation on a dependency parsed file.\n".
				"Usage: NED.pl\n".
				"-if <test file>\n".
				"-gold <gold standard file>\n".
				"-ml <Maximum sentence length to evaluate (default ${\INF})>\n".
				"-h - this message\n";
    }
}

exit (main(@ARGV));
