import sys
import re
import pprint
from os import listdir
from os.path import isfile, join
from nltk.corpus import wordnet as wn

# Command line: <ACE 2005 input folder> <CoNLL output folder> <file-order list path>.
# Fail with a readable usage line instead of an IndexError when arguments are missing.
if len(sys.argv) < 4:
	sys.exit('Usage: ' + sys.argv[0] + ' <input_folder> <output_folder> <file_order_path>')

print('Input Folder (ACE 2005) : ' + sys.argv[1])
print('Output Folder (CoNLL) : ' + sys.argv[2])
print('Order of files : ' + sys.argv[3])

# Folders whose files are converted; currently only the one given on the command line.
folders = []
# Previously hard-coded inputs, kept for reference:
# folders.append('ACE_2005_Processed_depparse\\nw')
# folders.append('ACE_2005_Processed_depparse\\bc')
# folders.append('ACE_2005_Processed_depparse\\bn')
# folders.append('ACE_2005_Processed_depparse\\wl')
folders.append(sys.argv[1])

# Single gold file shared by all converted inputs; closed at the end of the script.
fw_gold = open(join(sys.argv[2],'ACE2005.gold.conll'),'w')
fw_gold.write('#begin document null\n')

# Records the order in which input files are processed, one name per line.
fw_files = open(sys.argv[3],'w')

# Next unused coreference group id; shared across all files so ids never repeat.
curr_id = 0
# Strips the trailing "-<digits>" from a mention id, leaving the group name.
mention_id_pattern = re.compile(r'\-\d+$')

def get_sent_str(sent):
	"""Return the words of ``sent`` joined by single spaces.

	``sent`` is a sequence of token rows; element 0 of each row is the word.
	When the last word does not end in '.', '!', '?' or a single quote, a
	'DOT ADDED' notice is printed. The sentence itself is returned unchanged,
	because the code that appended the '.' is disabled.

	An empty ``sent`` yields an empty string (and no notice).
	"""
	words = [token[0] for token in sent]
	# Guard the empty sentence: there is no last word to inspect.
	if words and not words[-1].endswith(('.', '!', '?', "'")):
		#words.append('.')
		print('DOT ADDED: ' + ' '.join(words))
	return ' '.join(words)
	
def populate_sentToWordInfo_from_ACE(sentences):
	"""Build a per-sentence dependency lookup table from ACE token rows.

	Each token row supplies the word (column 0), the POS tag (column 1), the
	dependency relation to its parent (column 3) and the parent position
	(column 4, shifted by +1 here so that position 0 is the artificial root).

	Returns a dict mapping sentence number to a dict that maps 1-based token
	position to the tuple
	``(word, postag, parents, children, ner, basic_parents)`` where
	``parents`` maps parent position -> relation, ``children`` maps child
	position -> relation, ``ner`` is always 'O' and ``basic_parents`` is the
	list of parent positions. Position 0 holds the '_ROOT_' entry.
	"""
	sentToWordInfo = {}
	for sent_no, tokens in enumerate(sentences):
		table = {0: ('_ROOT_', '_ROOT_', {}, {}, 'O', [])}

		# First pass: create an entry for every token so that the linking
		# pass can reach parents that occur later in the sentence.
		for pos, token in enumerate(tokens, 1):
			table[pos] = (token[0], token[1], {}, {}, 'O', [])

		# Second pass: connect every token with its parent in both directions.
		for pos, token in enumerate(tokens, 1):
			governor = int(token[4]) + 1
			relation = token[3]
			entry = table[pos]
			entry[2][governor] = relation
			table[governor][3][pos] = relation
			entry[5].append(governor)

		sentToWordInfo[sent_no] = table
	return sentToWordInfo

def is_dependent_head(wordInfo, head_to_mid):
	"""Tell whether a word hangs off another mention head.

	``wordInfo`` is a word tuple as built by
	``populate_sentToWordInfo_from_ACE``; element 2 maps parent position
	(1-based, 0 being the root) to the dependency relation. ``head_to_mid``
	is keyed by 0-based token index, hence the ``- 1`` when looking a parent
	up.

	Returns True when some parent is itself a mention head and the relation
	to it starts with 'nmod', equals 'appos', or is 'conj:and' / 'conj:or'.
	"""
	# .items() rather than .iteritems(): same iteration, but not Python-2-only.
	return any(
		(parent_index - 1) in head_to_mid
		and (dep_rel.startswith('nmod') or dep_rel == 'appos' or dep_rel in ('conj:and', 'conj:or'))
		for parent_index, dep_rel in wordInfo[2].items()
	)
	
def get_merged_complete_phrase(indexToWordInfo, head, head_to_mid, children_indices, merged_heads):
	"""Collect the token positions that make up the phrase rooted at ``head``.

	Parameters:
	  indexToWordInfo  -- per-sentence table mapping 1-based position to a
	                      word tuple whose element 3 maps child position ->
	                      dependency relation.
	  head             -- 1-based position of the phrase head.
	  head_to_mid      -- dict keyed by 0-based token index of mention heads.
	  children_indices -- output list; receives ``head`` plus every child
	                      position that is pulled into the phrase.
	  merged_heads     -- output list; receives every pulled-in child that is
	                      itself a mention head.

	Direct modifier children (compound, det, amod, nmod:poss, det:qmod,
	nummod) are always pulled in. Children attached by nmod*, appos*,
	conj:and or conj:or are pulled in only when they are mention heads, and
	their own phrase is then collected recursively. Such a child ends up in
	``children_indices`` twice (once here, once as the ``head`` of the
	recursive call).

	Both lists are modified in place; nothing is returned.
	"""
	modifier_rels = (u'compound', u'det', u'amod', u'nmod:poss', 'det:qmod', 'nummod')
	children_indices.append(head)
	# .items() rather than .iteritems(): same iteration, but not Python-2-only.
	for child, dep_rel_child in indexToWordInfo[head][3].items():
		child_is_head = (child - 1) in head_to_mid
		if dep_rel_child in modifier_rels:
			children_indices.append(child)
			if child_is_head:
				merged_heads.append(child)
		elif child_is_head and (dep_rel_child.startswith(('nmod', 'appos')) or dep_rel_child in ('conj:and', 'conj:or')):
			children_indices.append(child)
			merged_heads.append(child)
			get_merged_complete_phrase(indexToWordInfo, child, head_to_mid, children_indices, merged_heads)

def create_input_and_gold_files(filename):
	"""Convert one ACE token file into a plain-text file and gold CoNLL rows.

	``filename`` is the path of the input file. Its base name is used both
	for the per-file output written into the output folder (``sys.argv[2]``)
	and as the first column of every gold row.

	Input format as used here: one token per line, tab separated, sentences
	separated by blank lines. Column 0 is the word, column 3 the dependency
	relation to the parent, column 4 the 0-based parent index, column 5 the
	entity type and column 6 the mention id ('o' when the token belongs to no
	mention). Stripping the trailing "-<digits>" from a mention id gives its
	coreference group name.

	Side effects:
	  * appends one row per token plus a blank line per sentence to the
	    module-level ``fw_gold`` file;
	  * writes one space-joined sentence per line to the per-file output;
	  * advances the module-level ``curr_id`` for every new group name;
	  * prints diagnostic output to stdout.
	"""
	global curr_id
	# Imported locally because the file-level import only brings in isfile/join.
	from os.path import basename

	# Use the argument instead of relying on the caller's loop variables.
	out_name = basename(filename)
	modifier_rels = (u'compound', u'det', u'amod', u'nmod:poss', 'det:qmod', 'nummod')

	sentences = []
	curr_sent = []
	group_to_id = {}
	with open(filename) as f:
		for line in f:
			line = line.strip()
			if not line:
				# Blank line closes the current sentence.
				if curr_sent:
					sentences.append(curr_sent)
					curr_sent = []
				continue
			parts = line.split('\t')
			curr_sent.append(parts)
			if parts[6] != 'o':
				group_name = mention_id_pattern.sub('', parts[6])
				if group_name not in group_to_id:
					group_to_id[group_name] = curr_id
					curr_id = curr_id + 1
	# The file may end without a trailing blank line.
	if curr_sent:
		sentences.append(curr_sent)

	sentToWordInfo = populate_sentToWordInfo_from_ACE(sentences)
	with open(join(sys.argv[2], out_name), 'w') as fw:
		for sno, sent in enumerate(sentences):
			mid_to_start = {}
			mid_to_end = {}
			mid_to_etype = {}  # never filled; only shown in the debug dump below
			mid_to_head = {}
			head_to_mid = {}

			# First and last token index of every mention in this sentence.
			for i, token in enumerate(sent):
				if token[6] != 'o':
					mid = token[6]
					if mid not in mid_to_start:
						mid_to_start[mid] = i
					mid_to_end[mid] = i

			# A token whose parent lies outside the mention span is a head of
			# that mention; when several qualify the last one wins in
			# mid_to_head, while head_to_mid keeps all of them.
			for mid, start in mid_to_start.items():
				end = mid_to_end[mid]
				for i in range(start, end + 1):
					dep_parent = int(sent[i][4])
					if dep_parent < start or dep_parent > end:
						mid_to_head[mid] = i
						head_to_mid[i] = mid

			# Heads that must not produce a coreference bracket.
			heads_to_ignore = set()
			for mid, head in mid_to_head.items():
				if sent[head][5] in ['WEA', 'VEH']:
					heads_to_ignore.add(head)
					print('IgnoredNER: ' + sent[head][0] + '\t' + sent[head][5] + '\n' + get_sent_str(sent))

			# A head that merely modifies another (non-ignored) head is dropped.
			for mid, head in mid_to_head.items():
				if head in heads_to_ignore:
					continue
				parent = int(sent[head][4])
				dep_rel_parent = sent[head][3]
				if dep_rel_parent in modifier_rels and parent in head_to_mid and parent not in heads_to_ignore:
					heads_to_ignore.add(head)
					print('Ignored: ' + sent[head][0] + '\t' + sent[parent][0] + '\n' + get_sent_str(sent))

			# Widen each mention span over contiguous modifier tokens that
			# attach directly to its head, to the left and to the right.
			for mid, head in mid_to_head.items():
				if head in heads_to_ignore:
					continue

				i = head - 1
				while i >= 0:
					if int(sent[i][4]) == head and sent[i][3] in modifier_rels:
						if i < mid_to_start[mid]:
							mid_to_start[mid] = i
					else:
						break
					i = i - 1
				i = head + 1
				while i <= (len(sent) - 1):
					if int(sent[i][4]) == head and sent[i][3] in modifier_rels:
						if i > mid_to_end[mid]:
							mid_to_end[mid] = i
					else:
						break
					i = i + 1

			# etypes holds BIO entity tags; it is filled below but not written
			# to any output by this function.
			etypes = ['O'] * len(sent)
			conll_group = ['-'] * len(sent)

			for mid, head in mid_to_head.items():
				head_info = sentToWordInfo[sno][head + 1]
				print('is_dependent_head : ' + head_info[0])
				if is_dependent_head(head_info, head_to_mid):
					print('Dependent Head : ' + head_info[0])
					heads_to_ignore.add(head)
				if head in heads_to_ignore:
					continue
				children_indices = []
				merged_heads = []
				get_merged_complete_phrase(sentToWordInfo[sno], head + 1, head_to_mid, children_indices, merged_heads)
				# Heads swallowed by this phrase must not open their own bracket
				# (this only affects heads visited later in this loop).
				for merged_head in merged_heads:
					heads_to_ignore.add(merged_head - 1)
				# children_indices is 1-based; convert the extremes to 0-based.
				sorted_children_indices = sorted(children_indices)
				start = sorted_children_indices[0] - 1
				end = sorted_children_indices[-1] - 1

				print(mid + ':children_indices:' + str(children_indices))

				group_name = mention_id_pattern.sub('', mid)
				group_id = group_to_id[group_name]
				if start == end:
					conll_group[start] = '(' + str(group_id) + ')'
				else:
					conll_group[start] = '(' + str(group_id)
					conll_group[end] = str(group_id) + ')'

				etype = sent[head][5]
				if etype in ['GPE', 'FAC']:
					etype = 'LOC'
				etypes[start] = 'B-' + etype
				for i in range(start + 1, end + 1):
					etypes[i] = 'I-' + etype

			pprint.pprint(sentToWordInfo[sno])
			print('mid_to_head:')
			pprint.pprint(mid_to_head)
			print('mid_to_start:')
			pprint.pprint(mid_to_start)
			print('mid_to_end:')
			pprint.pprint(mid_to_end)
			print('mid_to_etype:')
			pprint.pprint(mid_to_etype)

			# Gold row: file name, sentence number, token index, word, bracket.
			for i in range(len(sent)):
				line = str(out_name)
				line = line + '\t' + str(sno) + '\t' + str(i) + '\t' + str(sent[i][0]) + '\t' + conll_group[i] + '\n'
				fw_gold.write(line)
			fw_gold.write('\n')
			fw.write(get_sent_str(sent) + '\n')

# Convert every regular file of every input folder, skipping the auxiliary
# 'events*', 'relations*' and 'mention*' files. The try/finally makes sure the
# shared output files are closed (and therefore flushed) even when one input
# file fails to convert.
try:
	for folder in folders:
		onlyfiles = [f for f in listdir(folder) if isfile(join(folder, f))]
		for fl in onlyfiles:
			if str(fl).startswith(('events', 'relations', 'mention')):
				continue
			# Record the processing order, with any '..txt' removed from the name.
			fw_files.write(str(fl).replace('..txt','')+'\n')
			create_input_and_gold_files(join(folder,fl))
finally:
	fw_gold.close()
	fw_files.close()