#!/usr/bin/python
# -*- encoding=utf-8 -*-

# usage: python matchingExamples.py <training-file.txt> <gold-file.txt> <hash-to-context-map.txt>


# Compares a training-file and a gold-file, according to example ids.
# Calculates
# - the total number of training examples,
# - the number of training examples with a gold match,
# - the number of training examples which have the gold match in their context,
# - the noise present in the training data (three variants are printed).


import sys
import codecs

def read_lines(path):
    """Return the lines of the UTF-8 text file at *path*, line endings stripped."""
    # The context manager guarantees the handle is closed even on decode errors.
    with codecs.open(path, "r", encoding="utf-8") as handle:
        return [line.rstrip("\r\n") for line in handle.readlines()]


def parse_contexts(lines):
    """Build the map from context hash to its list of meanings.

    Each non-blank line must consist of exactly two whitespace-separated
    tokens: the hash and a ';'-separated list of meanings.
    """
    contexts = {}
    for line in lines:
        if not line.strip():
            continue  # tolerate blank lines instead of failing on the unpack
        contextHash, meanings = line.split()
        contexts[contextHash] = meanings.split(";")
    return contexts


def parse_training(lines):
    """Build the map from example id to (context hash, surface string).

    Each non-blank line has the form ``<id>::<hash> <word> <word> ...``.
    Runs of whitespace between words collapse to single spaces.
    """
    trainings = {}
    for line in lines:
        if not line.strip():
            continue
        fields = line.split("::")
        tokens = fields[1].split()
        trainings[fields[0]] = (tokens[0], " ".join(tokens[1:]))
    return trainings


def parse_gold(lines):
    """Build the map from example id to (gold meaning, surface string).

    Each non-blank line has the form ``<id>::<surface string> --> <meaning>``.
    """
    golds = {}
    for line in lines:
        if not line.strip():
            continue
        fields = line.split("::")
        parts = fields[1].split(" --> ")
        golds[fields[0]] = (parts[1], parts[0])
    return golds


def compare_examples(trainings, golds, contexts):
    """Match training examples against gold examples by id.

    Returns a dict with the keys
      total            -- number of training examples,
      in_gold          -- training examples whose id also occurs in *golds*,
      with_correct     -- of those, how many have the gold meaning in their context,
      string_matches   -- of those, how many have identical surface strings,
      discrepancies    -- list of (id, gold string, training string) for the rest,
      missing_contexts -- list of (id, hash) whose hash is absent from *contexts*.
    Examples are visited in sorted id order, so both lists are sorted by id.
    """
    stats = {
        "total": 0,
        "in_gold": 0,
        "with_correct": 0,
        "string_matches": 0,
        "discrepancies": [],
        "missing_contexts": [],
    }
    for exampleId in sorted(trainings):
        stats["total"] += 1
        if exampleId not in golds:
            continue  # no gold annotation for this example: nothing to compare
        goldMeaning, goldString = golds[exampleId]
        contextHash, trainingString = trainings[exampleId]

        # A gold match is recorded even when the meaning is not in the context.
        # An unknown context hash therefore must not drop the example; it is
        # treated as an empty context and reported separately.
        stats["in_gold"] += 1
        if contextHash not in contexts:
            stats["missing_contexts"].append((exampleId, contextHash))
        elif goldMeaning in contexts[contextHash]:
            stats["with_correct"] += 1

        if goldString == trainingString:
            stats["string_matches"] += 1
        else:
            stats["discrepancies"].append((exampleId, goldString, trainingString))
    return stats


def noise(hits, total):
    """Return 1 - hits/total, or NaN when *total* is zero (ratio undefined)."""
    if total == 0:
        return float("nan")
    return 1 - (float(hits) / float(total))


def run(argv):
    """Run the comparison for ``argv = [prog, training, gold, context-map]``.

    Prints the discrepancies and the summary to stdout and returns the
    process exit status (0 on success, 2 on missing arguments).
    """
    if len(argv) < 4:
        sys.stderr.write(
            "usage: python matchingExamples.py <training-file.txt> "
            "<gold-file.txt> <hash-to-context-map.txt>\n"
        )
        return 2

    trainings = parse_training(read_lines(argv[1]))
    golds = parse_gold(read_lines(argv[2]))
    contexts = parse_contexts(read_lines(argv[3]))
    stats = compare_examples(trainings, golds, contexts)

    # Single-argument print(...) behaves the same as a statement (Python 2)
    # and as a function (Python 3), so the stdout text is unchanged.
    for exampleId, goldString, trainingString in stats["discrepancies"]:
        print("Discrepancy between surface strings for %s" % exampleId)
        print(goldString)
        print(trainingString)

    if stats["missing_contexts"]:
        # Reported on stderr so the stdout report keeps its original format.
        sys.stderr.write(
            "Warning: %d training example(s) reference a context hash that is "
            "missing from the context map\n" % len(stats["missing_contexts"])
        )

    total = stats["total"]
    # NOTE: the labels below (including the "traing" typo) are kept verbatim
    # in case other tooling parses this output.
    print("Total # of traing-examples: %s" % total)
    print("      # of training-examples with Gold-Match: %s" % stats["in_gold"])
    print("      # of training-examples with Gold-Meaning in Context: %s" % stats["with_correct"])
    print("Total # of gold-examples: %s" % len(golds))
    print("Noise (1 - Training with Gold in Context/Total Training): %s" % noise(stats["with_correct"], total))
    print("Noise' (1 - Training with Gold / Total Training): %s" % noise(stats["in_gold"], total))
    print("Noise KM (1 - #Gold / Total Training): %s" % noise(len(golds), total))
    print("The strings match in %s cases" % stats["string_matches"])
    return 0


if __name__ == "__main__":
    sys.exit(run(sys.argv))
