#!/bin/bash
# Evaluates the correct prediction of MORPH links.
# Compares the output of the parser in CONLL format with the reference
# annotation, using the "MORPH"/"OTHER" prefix for MORPH/NON-MORPH links.
#
#
# It expects as arguments:
# 1. CONLL-formatted parsed sentences, including MORPH annotation
# 2. Same sentences, 1 per line after a header line (which is skipped),
#    prefixed by "MORPH" for MORPH links, "OTHER" for others
# 3. Dependent lemma of the MORPH link, e.g. adverb "encore" in "encore M> que"
# 4. Head lemma of the MORPH link, e.g. conjunction "que" in "encore M> que"
# Parameters 3 and 4 can be underspecified by using "*"

# Validate the command line: exactly four positional arguments are required.
# A usage error is reported on stderr and yields a non-zero exit status so
# that a calling script or pipeline can detect the failure.
if [[ $# -ne 4 ]]; then
    echo "usage: ./eval-morph.sh CONLL-FILE ANNOTATION-FILE DEPENDENT HEAD" >&2
    exit 1
fi

# Positional arguments.
conllfile=$1   # parser output (CONLL)
annotfile=$2   # reference annotation, one labelled sentence per line
dependant=$3   # lemma of the dependent word of the MORPH link
synhead=$4     # lemma of the head word of the MORPH link

# Labels written in front of each sentence and compared in the final step.
morphnumber="MORPH"
nomorphnumber="OTHER"

# Intermediate files, created in the current directory. Their names embed the
# two lemmas, so they may contain glob characters (e.g. "*"): always quote
# these variables when using them.
pattern="pattern_${synhead}_${dependant}.xml"
candidates="candidates_${synhead}_${dependant}.xml"
auto="auto_${synhead}_${dependant}.txt"
annot="annot_${synhead}_${dependant}.txt"
compare="compare_${synhead}_${dependant}.txt"

# ATTENTION: for clearer output, I'm redirecting the stderr of helper scripts
# to /dev/null. Bad for debugging!

# First, create an extraction pattern that describes the target construction,
# that is, the dependent lemma attached to the head lemma by a MORPH link.
# The pattern accepts both word orders (dependent before head, or head before
# dependent) with any number of words in between.
echo "1. Creating pattern file" >&2
# The target name is quoted: it embeds the lemmas, which may be "*", and an
# unquoted redirection target would undergo pathname expansion.
cat > "$pattern" <<EOF
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE patterns SYSTEM "dtd/ANONYMISED-patterns.dtd"><patterns><pat><either><pat><w lemma="$dependant" syndep="MORPH:a"/><pat repeat="*"><w/></pat><w lemma="$synhead" id="a" /></pat><pat><w lemma="$synhead" id="b" /><pat repeat="*"><w/></pat><w lemma="$dependant" syndep="MORPH:b"/></pat></either></pat></patterns>
EOF

# Second, detect all the interesting cases by matching the pattern on the
# CONLL-parsed file. This is done in 2 steps: candidate extraction, then
# candidate projection (annotate_mwe) back onto the parsed sentences. Output is
# PlainCorpus format, underscores indicating MORPH constructions.
echo "2. Joining the MORPH compounds -> plain text format" >&2

# Directory containing this script. dirname yields "." when $0 has no slash
# (e.g. "bash eval-morph.sh ..."), a case where ${0%/*} returns the script
# name itself and the helper paths below would be wrong.
scriptdir=$(dirname -- "$0")

# Step 2a: extract the candidates matching the pattern.
sed 's/\t*$//g' "$conllfile" | # Strip extra tabs at the end of each line
python "$scriptdir/bin/candidates.py" -S --from CONLL -p "$pattern" > "$candidates"

# Step 2b: project the candidates onto the sentences, then label every line.
# The stderr of annotate_mwe.py is deliberately discarded for clearer output.
sed 's/\t*$//g' "$conllfile" | # Strip extra tabs at the end of each line
python "$scriptdir/bin/annotate_mwe.py" --corpus-from CONLL --candidates-from XML -d Source --to PlainCorpus -c "$candidates" 2> /dev/null |
tail -n +2 | # remove header comment
sed -E -e "s/^(.*)${dependant}_(.*)$/${morphnumber} \1${dependant} \2/g" | # Precede MORPH sentences by ${morphnumber}
# Precede every other non-empty line by ${nomorphnumber}. The test is made on
# the whole "${morphnumber} " prefix: a bracket expression such as [^MORPH]
# would only exclude single characters and leave unlabelled any sentence
# starting with M, O, R, P or H.
sed -E -e "/^${morphnumber} /!s/^(.)/${nomorphnumber} \1/" > "$auto"

# Third, normalise the annotation file: drop its first (header) line and
# replace tabs by spaces, so that the label becomes the first space-separated
# field of each line.
echo "3. Transforming the format of annotated files" >&2
# File names are quoted: the temp name embeds the lemmas (possibly "*") and an
# unquoted redirection target would undergo pathname expansion.
tail -n +2 -- "$annotfile" | # remove header
sed 's/\t/ /g' > "$annot" # replace tab by space (old format, sorry)

# Fourth, compare the first columns of both files and calculate accuracy, that
# is, proportion of times the predicted MORPH link was correct, together with
# precision, recall and F-measure of the MORPH label.
echo "4. Comparing the output with annotation" >&2
lignes_annot=$(wc -l < "$annot")
lignes_auto=$(wc -l < "$auto")
if [[ $lignes_annot -ne $lignes_auto ]]; then
	# The two files cannot be aligned line by line: keep them for inspection.
	echo "ERROR: annotation and parsed files do not have the same number of sentences!" >&2
	wc "$annot" "$auto"
	echo "Not removing temp files $annot and $auto"
	rm -- "$pattern" "$candidates"
	exit 255
else
	# Put the reference label next to the automatically labelled sentence:
	# after paste, field 1 is the reference label and field 2 the predicted one.
	cut -f 1 -d " " -- "$annot" > "$annot.col1"
	paste -- "$annot.col1" "$auto" > "$compare"
	# Mismatching lines and the sentence count go to stdout; the "Errors:"
	# heading and the other scores go to stderr.
	awk -v morphnumber="$morphnumber" -v nomorphnumber="$nomorphnumber" '
	BEGIN {
	  correct = 0; correctannotauto = 0; total = 0; annot = 0; auto = 0
	  print "\nErrors:" > "/dev/stderr"
	}
	# Only lines whose reference label is one of the two known labels count.
	$1 == morphnumber || $1 == nomorphnumber {
	  if ($1 == $2) {
	    correct++
	    if ($1 == morphnumber) { correctannotauto++ }
	  } else {
	    if ($1 == morphnumber) {
	      print "MORPH predicted as other"
	    } else {
	      print "other predicted as MORPH"
	    }
	    print $0
	  }
	  if ($1 == morphnumber) { annot++ }
	  if ($2 == morphnumber) { auto++ }
	  total++
	}
	END {
	  # Every ratio falls back to 0 when its denominator is 0, so an input
	  # without any labelled line reports zeros instead of aborting awk.
	  p = (auto != 0) ? correctannotauto / auto : 0.0
	  r = (annot != 0) ? correctannotauto / annot : 0.0
	  f = (p + r != 0) ? 2 * p * r / (p + r) : 0.0
	  accuracy = (total != 0) ? correct / total : 0.0
	  print "\nOverall Accuracy = " accuracy > "/dev/stderr"
	  print "Nb of sentences = " total
	  print "Proportion auto  = " auto "/" (total - auto) > "/dev/stderr"
	  print "Proportion annot = " annot "/" (total - annot) > "/dev/stderr"
	  print "(pr) True Positives = " correctannotauto > "/dev/stderr"
	  print "(p )      Positives = " auto > "/dev/stderr"
	  print "(r ) True           = " annot > "/dev/stderr"
	  print "Precision MORPH  = " p > "/dev/stderr"
	  print "Recall    MORPH  = " r > "/dev/stderr"
	  print "F-measure MORPH  = " f > "/dev/stderr"
	}' "$compare"
	rm -- "$compare" "$annot.col1"
fi

# Remove the remaining intermediate files. The names are quoted because they
# embed the lemma arguments: with "*" as a lemma, an unquoted name would be
# glob-expanded and could delete the temp files of other runs.
rm -- "$annot" "$auto" "$pattern" "$candidates"
