#!/usr/bin/python
# -*- encoding=utf-8 -*-

# Parses the Chen, Kim and Mooney XML data files.
# NB: xml.sax does not work with EUC-KR, so you need to convert the Korean data to UTF-8 first.
# You can use the bash script ./convert_korean.sh to do this.


from xml.sax import ContentHandler
from xml.sax.handler import feature_namespaces

def normalize_whitespace(someString):
	"""Return someString with each whitespace run collapsed to a single space.

	Leading and trailing whitespace is dropped as well, because str.split()
	without arguments discards empty fields at both ends.
	"""
	tokens = someString.split()
	return " ".join(tokens)

class ParseTrainingFile(ContentHandler):
	"""SAX handler that extracts meanings and sentences from a training file.

	Results are written into the two lists handed to the constructor:

	* for every <sem> element, its text with all whitespace removed is
	  appended to pointerToSemList;
	* for every <example> element, a tuple
	  (lower-cased <nl> text, list of ints from <semid>, "id" attribute as str)
	  is appended to pointerToSentList.
	"""

	def __init__(self, pointerToSemList, pointerToSentList):
		"""Store the two output lists and put the handler into its idle state.

		pointerToSemList  -- list that receives one string per <sem> element
		pointerToSentList -- list that receives one tuple per <example> element
		"""
		# Explicit base-class call (instead of super()) so the handler is
		# initialised the same way on Python 2 and Python 3.
		ContentHandler.__init__(self)
		self.in_sem = False
		self.in_sent = False
		self.in_semid = False
		self.in_nl = False
		# Text buffers / per-example values; defined up front so that no
		# callback can hit an attribute that does not exist yet.
		self.sem = ""
		self.nl = ""
		self.semids = []
		self.id = None  # set from the "id" attribute when an <example> opens
		self.pointerToSem = pointerToSemList
		self.pointerToSent = pointerToSentList

	def startElement(self, name, attrs):
		"""Raise the flag for the element being opened and clear its buffer."""
		if name == "sem":
			self.in_sem = True
			self.sem = ""
		elif name == "example":
			self.in_sent = True
			# str() turns a missing attribute (None) into the string "None".
			self.id = str(attrs.get("id"))
			# Start each example from a clean slate: without this, an example
			# lacking <nl> or <semid> would silently inherit the values (and
			# the very same list object) of the previous example.
			self.nl = ""
			self.semids = []
		elif name == "semid":
			# Holds raw text while the element is open; endElement turns it
			# into a list of ints.
			self.semids = ""
			self.in_semid = True
		elif name == "nl":
			self.nl = ""
			self.in_nl = True

	def characters(self, c):
		"""Append a chunk of character data to the buffer of the open element.

		The parser may deliver the text of one element in several chunks,
		hence the accumulation.  <sem> takes precedence over <nl>, which
		takes precedence over <semid>.
		"""
		if self.in_sem:
			self.sem += c
		elif self.in_nl:
			self.nl += c
		elif self.in_semid:
			self.semids += c

	def endElement(self, name):
		"""Finish the element being closed and emit results where due."""
		if name == "sem":
			self.in_sem = False
			# split() + join removes every whitespace character in one pass.
			self.pointerToSem.append("".join(self.sem.split()))
		elif name == "nl":
			self.nl = normalize_whitespace(self.nl)
			self.in_nl = False
		elif name == "semid":
			self.in_semid = False
			# split() already ignores surrounding and repeated whitespace;
			# int() raises ValueError on a token that is not an integer.
			self.semids = [int(x) for x in self.semids.split()]
		elif name == "example":
			self.in_sent = False
			self.pointerToSent.append((self.nl.lower(), self.semids, self.id))

class ParseGoldStandard(ContentHandler):
	"""SAX handler that fills a dict mapping sentences to gold meanings.

	For every <mrl> element an entry is stored in the dict handed to the
	constructor: the key is '<example id>::<lower-cased <nl> text>', the
	value is the text of <mrl> with all whitespace removed.
	"""

	def __init__(self, pointerToNLMap):
		"""Store the output dict and put the handler into its idle state.

		pointerToNLMap -- dict that receives one entry per <mrl> element
		"""
		# Explicit base-class call (instead of super()) so the handler is
		# initialised the same way on Python 2 and Python 3.
		ContentHandler.__init__(self)
		# in_ex is never updated by this handler; it is only kept so that
		# code reading the attribute keeps working.  The flag that actually
		# tracks <example> is in_sent, as in ParseTrainingFile.
		self.in_ex = False
		self.in_sent = False
		self.in_nl = False
		self.in_sem = False
		# Text buffers, defined up front so that no callback can hit an
		# attribute that does not exist yet.
		self.sem = ""
		self.nl = ""
		self.id = None  # set from the "id" attribute when an <example> opens
		self.pointerToNLMap = pointerToNLMap

	def startElement(self, name, attrs):
		"""Raise the flag for the element being opened and clear its buffer."""
		if name == "mrl":
			self.in_sem = True
			self.sem = ""
		elif name == "example":
			self.in_sent = True
			# str() turns a missing attribute (None) into the string "None".
			self.id = str(attrs.get("id"))
			# Forget the previous sentence so that an example without <nl>
			# cannot be keyed under the preceding example's text.
			self.nl = ""
		elif name == "nl":
			self.nl = ""
			self.in_nl = True

	def characters(self, c):
		"""Append a chunk of character data to the buffer of the open element.

		The parser may deliver the text of one element in several chunks,
		hence the accumulation.  <mrl> takes precedence over <nl>.
		"""
		if self.in_sem:
			self.sem += c
		elif self.in_nl:
			self.nl += c

	def endElement(self, name):
		"""Finish the element being closed; store the entry when <mrl> ends."""
		if name == "mrl":
			self.in_sem = False
			key = self.id + "::" + self.nl.lower()
			# split() + join removes every whitespace character in one pass.
			self.pointerToNLMap[key] = "".join(self.sem.split())
		elif name == "nl":
			self.nl = normalize_whitespace(self.nl)
			self.in_nl = False
		elif name == "example":
			# Previously in_sent was set on <example> but never cleared.
			self.in_sent = False
