#!/usr/bin/python
# -*- encoding=utf-8 -*-



# usage:	python processData.py <trainingFolder> <goldFolder> <outputfolder>

# converts the Chen, Kim and Mooney (CKM) xml-data into simple string training data, i.e. sentences prefixed with context-identifiers
# this script assumes that contexts are sets (unlike CKM); to create multi-set data, use ./processData_multisets.py

# The script assumes that the alphabetical order of training-files in trainingFolder matches the alphabetic order of gold-files in goldFolder.
# Simply put, make sure that training1 corresponds to gold1, training2 to gold2 and so on.




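# the following files are created in outputfolder:
#	-meanings.txt	-	list of all the meanings used in the training-data
#	-meaning1.txt,...,meaningN.txt	-	list of the meanings in the training-sets 1 to N
#	-hashToCont.txt	-	list of ContextIdentifiers (Number) with their Plain-Text Context-Representation (String)
#	-statistics.txt	-	some statistics for every training-set
#	-training/training1.txt,...,training/trainingN.txt (converted trainingfiles, format is:	"exampleID::SentencePrefixedWithContextIdentifier")
#	-gold/gold1.txt,...,gold/goldN.txt (converted goldfiles, format is: "exampleID::Sentence[WITHOUTcontextIdentifier] --> GoldMR")
#	-training/trainingGold1.txt,...,training/trainingGoldN.txt (unambiguous trainingdata, for upper bound experiment)
#	-goldHashToContext.txt	-	list of the identifiers of the gold-meanings with their Plain-Text Representation
#	-goldMeanings.txt	-	list of all the gold-meanings
#	-vocabulary.txt	-	list of all the words occurring in the training- and gold-data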
																										

from xml.sax import make_parser
from xmlparser import ParseTrainingFile, ParseGoldStandard
import xml
import os
import sys
import codecs



def convert_training(trainingFolder,outputf, words):
	"""Create plain-text training files from the xml training files provided by CKM.

	The files in trainingFolder are processed in alphabetical order. For the
	k-th file that parses successfully, these files are written below outputf:

		training/training<k>.txt	one line "exampleID::contextID sentence" per example
		meaning<k>.txt			the distinct meanings of that file, sorted, one per line

	Accumulated over all processed files:

		statistics.txt	one statistics paragraph per training file
		meanings.txt	all distinct meanings, sorted, one per line
		hashToCont.txt	one line "contextID context" per distinct context

	A context is treated as a SET of meanings (duplicates are dropped). Its
	plain-text form is the ';'-joined, sorted list of its meanings and its
	identifier is str(hash(...)) of that text.

	Parameters:
		trainingFolder	directory holding the xml training files
		outputf		output directory; outputf/training is created if it is missing
		words		dict that is updated in place, every whitespace-separated
				token of every sentence is stored as words[token] = 1

	Files that raise a SAX parse error are reported on stdout and skipped; a
	skipped file does not consume a set number.
	"""
	trainingOut = os.path.join(outputf, "training")
	if not os.path.exists(trainingOut):
		os.makedirs(trainingOut)
	setCounter = 1
	contToHash = {}		# plain-text context -> context identifier
	meanings = set()	# every meaning found in any processed training file
	# 'with' makes sure statistics.txt is flushed and closed, also when an error occurs
	with codecs.open(os.path.join(outputf, "statistics.txt"),"w",encoding='utf-8') as statsOut:
		for training in sorted(os.listdir(trainingFolder)):
			#<read in the xml-file>
			print("Processing " + training)
			# the content handler receives both lists and is expected to fill
			# them while parsing (ParseTrainingFile is defined in xmlparser)
			semForms = []
			nlSents = []
			parser = make_parser()
			parser.setContentHandler(ParseTrainingFile(semForms,nlSents))
			try:
				parser.parse(os.path.join(trainingFolder, training))
			except xml.sax.SAXParseException:
				print(training + " is not a valid file, skipped")
				continue
			#</ read in the xml-file>
			examples = 0
			longestContext = 0
			totalEvents = 0
			trainingPath = os.path.join(trainingOut, "training"+str(setCounter)+".txt")
			#now write out the plain-strings
			with codecs.open(trainingPath,"w",encoding='utf-8') as training_new:
				for sent, semids, sentid in nlSents:
					examples += 1
					for w in sent.split():
						words[w] = 1
					#!!! remove duplicate events - meaning-SET; for multi-sets, use processData_multisets.py !!!
					# join-then-split keeps the historic behaviour: an example without
					# any meaning yields the one-element context {''}
					events = set(';'.join(semForms[semid] for semid in semids).split(';'))
					totalEvents += len(events)
					longestContext = max(longestContext, len(events))
					# sort before joining: the iteration order of a set is arbitrary, so
					# without it equal contexts could end up with different identifiers
					context = ';'.join(sorted(events))
					contextId = str(hash(context))
					contToHash[context] = contextId
					training_new.write(sentid+"::"+contextId+' '+sent+"\n")
			# a valid file without examples must not abort the run with a ZeroDivisionError
			if examples:
				averageAmbiguity = float(totalEvents)/float(examples)
			else:
				averageAmbiguity = 0.0
			statsOut.write("Statistics for "+training+":\n-------------- "
				+"Number of examples:"+str(examples)+"\n "
				+"Total events:"+str(totalEvents)+"\n "
				+"Different meanings:"+str(len(semForms))+"\n "
				+"Longest Context:"+str(longestContext)+"\n "
				+"Average Ambiguity:"+str(averageAmbiguity)+"\n------\n-----")
			#store the meanings present in this specific training file
			meaningsPath = os.path.join(outputf, "meaning"+str(setCounter)+".txt")
			with codecs.open(meaningsPath,"w",encoding='utf-8') as meanings_n:
				for m in sorted(set(semForms)):
					meanings_n.write(m+"\n")
			meanings.update(semForms)
			setCounter += 1

	#save all the meanings
	with codecs.open(os.path.join(outputf, "meanings.txt"),"w",encoding='utf-8') as meaningsOut:
		for sem in sorted(meanings):
			meaningsOut.write(sem+"\n")

	#save the context-hash-map
	with codecs.open(os.path.join(outputf, "hashToCont.txt"),"w",encoding='utf-8') as hashToCont:
		for cont in contToHash:
			hashToCont.write(contToHash[cont]+" "+cont+"\n")

def convert_gold(goldFolder, outputF, words):
	"""Create gold-files and gold-training files from the xml gold files provided by CKM.

	The files in goldFolder are processed in alphabetical order. For the k-th
	file that parses successfully, these files are written below outputF:

		gold/gold<k>.txt		one line "exampleID::sentence --> goldMR" per example
		training/trainingGold<k>.txt	one line "exampleID::meaningID sentence" per example

	Accumulated over all processed files:

		goldHashToContext.txt	one line "meaningID goldMR" per distinct gold meaning
		goldMeanings.txt	one gold meaning per line

	The identifier of a gold meaning is str(hash(goldMR)).

	Parameters:
		goldFolder	directory holding the xml gold files
		outputF		output directory; outputF/gold and outputF/training are
				created if they are missing
		words		dict that is updated in place, every whitespace-separated
				token of every sentence is stored as words[token] = 1

	Files that raise a SAX parse error are reported on stdout and skipped; a
	skipped file does not consume a set number.
	"""
	# the gold-training files go into "training", so that folder has to exist
	# too - even if convert_training was not run before
	for subFolder in ("gold", "training"):
		target = os.path.join(outputF, subFolder)
		if not os.path.exists(target):
			os.makedirs(target)
	setCount = 1
	goldContext = {}	# gold meaning -> identifier
	for gold in sorted(os.listdir(goldFolder)):
		#read in the xml-file
		print("Processing " + gold)
		# the content handler receives NLMap and is expected to fill it with
		# "exampleID::sentence" -> goldMR (ParseGoldStandard is defined in xmlparser)
		NLMap = {}
		parser = make_parser()
		parser.setContentHandler(ParseGoldStandard(NLMap))
		try:
			parser.parse(os.path.join(goldFolder, gold))
		except xml.sax.SAXParseException:
			print(gold + " is not a valid file, skipped")
			continue

		goldPath = os.path.join(outputF, "gold", "gold"+str(setCount)+".txt")
		goldTrainingPath = os.path.join(outputF, "training", "trainingGold"+str(setCount)+".txt")
		with codecs.open(goldPath,"w",encoding='utf-8') as gold_new:
			with codecs.open(goldTrainingPath,"w",encoding='utf-8') as gold_training:
				for sent in NLMap:
					meaning = NLMap[sent]
					if meaning not in goldContext:	#introduce a hash for each meaning
						goldContext[meaning] = str(hash(meaning))
					# split at the FIRST "::" only, otherwise a sentence that
					# contains "::" itself would be cut off
					parts = sent.split("::", 1)
					sentId, text = parts[0], parts[1]
					for w in text.split():
						words[w] = 1
					gold_new.write(sent+" --> "+meaning+"\n")
					gold_training.write(sentId+"::"+goldContext[meaning]+" "+text+"\n")
		setCount += 1
	with codecs.open(os.path.join(outputF, "goldHashToContext.txt"),"w",encoding='utf-8') as goldHashToContext:
		with codecs.open(os.path.join(outputF, "goldMeanings.txt"),"w",encoding='utf-8') as goldMeanings:
			for cont in goldContext:
				goldHashToContext.write(goldContext[cont]+" "+cont+"\n")
				goldMeanings.write(cont+"\n")

if __name__ == "__main__":
	# three positional arguments are required; print the usage instead of
	# failing with an IndexError (additional arguments are ignored, as before)
	if len(sys.argv) < 4:
		sys.stderr.write("usage:	python processData.py <trainingFolder> <goldFolder> <outputfolder>\n")
		sys.exit(1)
	trainingfolder = sys.argv[1]
	goldfolder = sys.argv[2]
	outputfolder = sys.argv[3]
	if not os.path.exists(outputfolder):
		os.makedirs(outputfolder)
	# both converters add every word they see to this dict (word -> 1)
	words = {}
	convert_training(trainingfolder,outputfolder, words)
	convert_gold(goldfolder,outputfolder, words)
	#save the vocabulary
	with codecs.open(outputfolder+"/vocabulary.txt","w",encoding='utf-8') as vocabulary:
		for word in words:
			vocabulary.write(word+"\n")
	

