#!/usr/bin/python
# -*- encoding=utf-8 -*-

# usage: python matchingExamplesStrings.py <training-file.txt> <gold-file.txt> <hash-to-context-map.txt>


# Compares a training-file and a gold-file, accommodating the fact that the
# example ids of the gold-file and the training-file often do not match, by
# matching the examples according to their surface-string (and their relative
# position in the file).

# Calculates
# - the total number of training examples,
# - the number of training examples with a gold match,
# - the number of training examples which have the gold match in their context,
# - the noise present in the training data.


import sys
import codecs

def findFirst(someList, someSent):
	"""Return the index of the first entry whose second element is someSent.

	someList -- sequence of indexable entries; element [1] of each entry is
	compared against someSent with ==.
	someSent -- the value to look for.

	Returns -1 when no entry matches (including when someList is empty).
	"""
	matches = (index for index, entry in enumerate(someList) if entry[1] == someSent)
	# next() with a default yields the first hit or the -1 sentinel.
	return next(matches, -1)


def readLines(path):
	"""Return the non-blank lines of the UTF-8 text file at path.

	Trailing carriage returns and newlines are removed from every line and the
	file is closed before returning.  Whitespace-only lines are dropped: none
	of the three input formats can describe an example on such a line, so they
	would only make the parsers fail.
	"""
	with codecs.open(path, "r", encoding='utf-8') as handle:
		stripped = [raw.rstrip('\r\n') for raw in handle.readlines()]
	return [line for line in stripped if line.strip()]


def parseContexts(lines):
	"""Build the map from context hash to its list of meanings.

	Each line must consist of exactly two whitespace-separated tokens: the
	hash and a ';'-separated list of meanings.  Raises ValueError otherwise.
	"""
	contextMap = {}
	for line in lines:
		contextHash, meanings = line.split()
		contextMap[contextHash] = meanings.split(";")
	return contextMap


def parseTraining(lines):
	"""Build the map from example id to (context hash, sentence).

	Each line has the form '<id>::<hash> <word> <word> ...'.  The sentence is
	the remaining words joined by single spaces.  A later line with the same
	id replaces the earlier one.
	"""
	trainingMap = {}
	for line in lines:
		fields = line.split("::")
		tokens = fields[1].split()
		trainingMap[fields[0]] = (tokens[0], " ".join(tokens[1:]))
	return trainingMap


def parseGold(lines):
	"""Return the gold examples as a list of (meaning, sentence) in file order.

	Each line has the form '<id>::<sentence> --> <meaning>'.  The gold id is
	ignored because gold and training ids are not expected to agree.
	"""
	goldPairs = []
	for line in lines:
		pieces = line.split("::")[1].split(' --> ')
		goldPairs.append((pieces[1], pieces[0]))
	return goldPairs


def noiseRatio(hits, total):
	"""Return 1 - hits/total, or NaN when total is 0 (ratio undefined)."""
	if total == 0:
		return float("nan")
	return 1 - (float(hits) / float(total))


def main(argv):
	"""Match training examples to gold examples and print the statistics.

	argv -- the command line: script name, training file, gold file and
	hash-to-context map file (extra arguments are ignored).

	Exits with a usage message when fewer than three file arguments are given.
	A training example whose context hash is absent from the map still raises
	KeyError, since that indicates inconsistent input files.
	"""
	if len(argv) < 4:
		sys.exit("usage: python matchingExamplesStrings.py <training-file.txt> <gold-file.txt> <hash-to-context-map.txt>")

	trainings = parseTraining(readLines(argv[1]))
	golds = parseGold(readLines(argv[2]))
	context = parseContexts(readLines(argv[3]))

	totalTraining = 0
	trainingInGold = 0
	trainingWithCorrect = 0
	# Remember the gold size now; matched entries are removed from golds below.
	goldLen = len(golds)

	# Go through the training sentences in numeric id order.
	for exampleId in sorted(trainings, key=int):
		totalTraining += 1
		contextHash, sentence = trainings[exampleId]

		pos = findFirst(golds, sentence)
		if pos < 0:
			continue		# no gold example with this surface string

		trainingInGold += 1
		goldMeaning = golds[pos][0]
		candidates = context[contextHash]
		if goldMeaning in candidates:
			trainingWithCorrect += 1
		else:
			print("Missing: Example %s has goldmeaning %s and context %s" % (exampleId, goldMeaning, candidates))
		golds.pop(pos)		# only match every gold example once

	if golds:
		print("Unmatched Gold-Examples:")
		for left in golds:
			print("\t%s" % (left[1],))

	print("Total # of traing-examples: %s" % totalTraining)
	print("      # of training-examples with Gold-Match: %s" % trainingInGold)
	print("      # of training-examples with Gold-Meaning in Context: %s" % trainingWithCorrect)
	print("Total # of gold-examples: %s" % goldLen)
	print("Noise (1 - Training with Gold in Context/Total Training): %s" % noiseRatio(trainingWithCorrect, totalTraining))
	print("Noise' (1 - Training with Gold / Total Trianing): %s" % noiseRatio(trainingInGold, totalTraining))
	print("Noise KM (1 - #Gold / Total Training): %s" % noiseRatio(goldLen, totalTraining))


if __name__=="__main__":
	main(sys.argv)
