# Converts a TimeBank 1.2 file to TempEval-2 format.
# Usage: the first command-line argument is the base directory, the second is the file name.

import sys
import re
# When set to 'false', timebank_to_tempeval() writes the per-document DCT
# file; with any other value that write is skipped.
dct_tmp_off = 'false'

# Verbosity level: the trace prints in this file only fire when debug >= 2.
debug = 1

def bar(index):
    """Return the command-line argument at position *index* of ``sys.argv``.

    When the module-level ``debug`` level is 2 or higher the argument is
    also echoed to standard output.

    Raises:
        IndexError: if fewer than ``index + 1`` entries are in ``sys.argv``.
    """
    argument = sys.argv[index]
    if debug >= 2:
        # Single-argument parenthesised print behaves the same on
        # Python 2 and Python 3.
        print(argument)
    return argument


def extract_name(filename):
    """Return the part of *filename* after its last '/' separator.

    A name without any '/' is returned unchanged; a name ending in '/'
    yields the empty string.
    """
    _head, _sep, tail = filename.rpartition('/')
    return tail

## Input and output locations.
## basedir is concatenated directly with the sub-paths below, so it has to
## end with a path separator (as in the two examples).

## absolute path of basedir 
# basedir = '/Users/naushadzaman/Documents/work/research/URCS/temporal-ordering/experiments/TimeBank/Timebank-1-2/TimeBank_1_2/'

## relative path of basedir 
#basedir = './'
basedir = bar(1)   # first command-line argument
filename = bar(2)  # second command-line argument

# Source TimeBank file: the base name of `filename` with "txt" replaced by
# "tml".  NOTE(review): re.sub replaces every "txt" occurrence in the base
# name, not only the extension -- confirm no document name contains "txt".
timebank_file = basedir + 'data/1-corpus-data/' + re.sub('txt', 'tml', extract_name(filename))
# Output directories, all located under <basedir>data/.
tempeval_dir = basedir + 'data/'
dct_dir = tempeval_dir + '3-dct/'
all_tlinks_dir = tempeval_dir + '2-all-tlinks/'
# NOTE(review): 'tinks-event-event/' lacks the 'l' used by the sibling
# 'tlinks-*' directories -- possibly a typo; check the directory layout on
# disk before changing it.
tlinks_event_event_dir = tempeval_dir + 'tinks-event-event/'
tlinks_dct_event_dir = tempeval_dir + 'tlinks-dct-event/' 
tlinks_timex_event_dir = tempeval_dir + 'tlinks-timex-event/' 
tlinks_timex_timex_dir = tempeval_dir + 'tlinks-timex-timex/' 

# Module-level registries filled in by timebank_to_tempeval().
event_list = {}    # eid -> Event
timex_list = {}    # tid -> Timex
eiid_to_eid = {}   # eiid (event instance id) -> eid

class Event:
    """Record for one event, identified by its eid.

    Every feature other than ``eid`` starts out as the empty string and is
    expected to be filled in by the code that creates the record.
    """

    def __init__(self, eid_param):
        self.eid = eid_param
        # Remaining features are blank until assigned.
        self.eiid = self.class2 = self.tense = ''
        self.aspect = self.pos = self.polarity = ''

class Timex:
    """Record for one temporal expression, identified by its tid.

    ``type`` and ``value`` start out as empty strings.
    """

    def __init__(self, tid_param):
        self.tid = tid_param
        self.type = self.value = ''

def take_word_from_text_and_return_id(word, text):
    """Return the value of the first ``word="..."`` attribute found in *text*.

    The attribute name is matched as a plain substring (no word boundary),
    so the first occurrence of ``word="`` anywhere in *text* wins.

    Args:
        word: attribute name to look for, taken literally.
        text: markup fragment to search.

    Returns:
        The text between the double quotes, or the string 'NONE' when no
        such attribute is present.
    """
    # re.escape keeps any regex metacharacter in `word` literal, and the
    # capture group returns the quoted value untouched -- the value is no
    # longer mangled when it happens to contain `word=` itself.
    match = re.search(re.escape(word) + '="([^"]*)"', text)
    if match is None:
        return 'NONE'
    feature = match.group(1)
    if debug >= 2:
        # Formatted single-argument print: same output on Python 2 and 3.
        print('FEATURE: %s' % feature)
    return feature

def reverse_relation(relation):
    """Return the inverse of a temporal relation label.

    Labels that come in converse pairs are swapped; any other label
    (including 'DURING') is returned unchanged.
    """
    converse_pairs = (
        ('BEFORE', 'AFTER'),
        ('IBEFORE', 'IAFTER'),
        ('BEGINS', 'BEGUN_BY'),
        ('ENDS', 'ENDED_BY'),
        ('IS_INCLUDED', 'INCLUDES'),
    )
    for forward, backward in converse_pairs:
        if relation == forward:
            return backward
        if relation == backward:
            return forward
    # Symmetric or unknown labels are their own inverse.
    return relation


def _tlink_record(doc_name, source, target, relation):
    """Return one TLINK as a tab-separated, newline-terminated row."""
    return '\t'.join((doc_name, source, target, relation)) + '\n'


def timebank_to_tempeval():
    """Convert the file at ``timebank_file`` into the TempEval output files.

    Reads the module-level paths (``timebank_file``, ``filename`` and the
    ``*_dir`` directories) and flags (``debug``, ``dct_tmp_off``), and fills
    the module-level ``timex_list``, ``event_list`` and ``eiid_to_eid``
    registries while scanning the document line by line.

    Output, one ``<name>.txt`` per directory:
      * ``dct_dir``: ``name<TAB>date<TAB>tid`` of the document creation time
        (only written when ``dct_tmp_off == 'false'``);
      * ``all_tlinks_dir``: every recognised TLINK;
      * event-event, dct-event, timex-event and timex-timex directories:
        the TLINKs of the matching kind.  A time->event link is written as
        event->time with the relation reversed.

    Exits with status 1 (after printing the document name and an error
    message) unless exactly one CREATION_TIME/PUBLICATION_TIME TIMEX tag is
    found.

    Raises:
        KeyError: if a MAKEINSTANCE refers to an event id, or a TLINK to an
            event instance id, that has not been seen earlier in the file.
    """
    with open(timebank_file) as tml_file:
        timebank_text = tml_file.read()
    # Strip only a literal trailing ".tml"; the unescaped, unanchored
    # pattern would also eat e.g. the "html" in "html_doc".
    name = re.sub(r'\.tml$', '', extract_name(filename))

    ## handle dct
    dct_tags = (re.findall('<TIMEX[^>]*\"CREATION_TIME\">', timebank_text)
                + re.findall('<TIMEX[^>]*\"PUBLICATION_TIME\">', timebank_text))
    if len(dct_tags) != 1:
        print(name)
        print('Multiple/None DCT! Error!!!')
        # A bare `exit` is a no-op expression; actually stop here, since
        # the dct-event split below needs a single DCT id.
        sys.exit(1)

    dct_tag = dct_tags[0]
    if debug >= 2:
        print(dct_tags)
    val = take_word_from_text_and_return_id('value', dct_tag)
    # The DCT id is needed for the TLINK split even when the DCT file
    # itself is switched off.
    dct_tid = take_word_from_text_and_return_id('tid', dct_tag)
    if dct_tmp_off == 'false':
        # Keep the date part only and drop the dashes (YYYY-MM-DD -> YYYYMMDD).
        date = re.sub('-', '', val.split('T')[0])
        dct_row = name + '\t' + date + '\t' + dct_tid
        if debug >= 2:
            print(dct_row)
        with open(dct_dir + name + '.txt', 'w') as dctfile:
            dctfile.write(dct_row + '\n')

    output_dirs = (all_tlinks_dir, tlinks_event_event_dir,
                   tlinks_dct_event_dir, tlinks_timex_event_dir,
                   tlinks_timex_timex_dir)
    handles = []
    try:
        for output_dir in output_dirs:
            handles.append(open(output_dir + name + '.txt', 'w'))
        (all_tlinks_file, tlinks_event_event, tlinks_dct_event,
         tlinks_timex_event, tlinks_timex_timex) = handles

        ## handle
        for line in timebank_text.split('\n'):
            if line.strip() == "":
                continue

            for timex in re.findall('<TIMEX3[^>]*[^<]*</TIMEX3>', line):
                if debug >= 2:
                    print(line)
                tid = take_word_from_text_and_return_id('tid', timex)
                # Inline-marked form of the expression; only used for tracing.
                new_timex = re.sub('<TIMEX3[^>]*>', '', timex)
                new_timex = re.sub('</TIMEX3>', '', new_timex)
                new_timex = tid + '_s_' + new_timex + '_' + tid + '_e'
                timex_type = take_word_from_text_and_return_id('type', timex)
                value = take_word_from_text_and_return_id('value', timex)
                if debug >= 2:
                    print(new_timex)
                    print('')
                tmp_timex = Timex(tid)
                tmp_timex.type = timex_type
                tmp_timex.value = value
                timex_list[tid] = tmp_timex
                if debug >= 2:
                    print('%s %s %s' % (tmp_timex.tid, tmp_timex.type,
                                        tmp_timex.value))

            for event in re.findall('<EVENT[^>]*[^<]*</EVENT>', line):
                eid = take_word_from_text_and_return_id('eid', event)
                class2 = take_word_from_text_and_return_id('class', event)
                if debug >= 2:
                    print(line)
                    print('%s %s' % (eid, class2))
                tmp_event = Event(eid)
                tmp_event.class2 = class2
                event_list[eid] = tmp_event
                if debug >= 2:
                    print('%s %s' % (tmp_event.eid, tmp_event.class2))

            for instance in re.findall('<MAKEINSTANCE[^>]*>', line):
                eid = take_word_from_text_and_return_id('eventID', instance)
                eiid = take_word_from_text_and_return_id('eiid', instance)
                eiid_to_eid[eiid] = eid
                tense = take_word_from_text_and_return_id('tense', instance)
                aspect = take_word_from_text_and_return_id('aspect', instance)
                polarity = take_word_from_text_and_return_id('polarity', instance)
                pos = take_word_from_text_and_return_id('pos', instance)
                current = event_list[eid]
                current.eiid = eiid
                current.tense = tense
                current.aspect = aspect
                current.polarity = polarity
                # Previously extracted but never stored on the event.
                current.pos = pos
                if debug >= 2:
                    print('%s %s %s %s %s %s' % (
                        current.eid, current.eiid, current.class2,
                        current.tense, current.aspect, current.polarity))
                    print(instance)

            for tlink in re.findall('<TLINK[^>]*>', line):
                from_event = 'eventInstanceID' in tlink
                from_time = 'timeID' in tlink
                to_event = 'relatedToEventInstance' in tlink
                to_time = 'relatedToTime' in tlink

                if from_event and to_event:
                    # tlinks_event_event
                    eventInstanceID = take_word_from_text_and_return_id('eventInstanceID', tlink)
                    relatedToEventInstance = take_word_from_text_and_return_id('relatedToEventInstance', tlink)
                    e1 = eiid_to_eid[eventInstanceID]
                    e2 = eiid_to_eid[relatedToEventInstance]
                    relType = take_word_from_text_and_return_id('relType', tlink)
                    if debug >= 2:
                        print(tlink)
                        print('%s %s %s' % (e1, e2, relType))
                    record = _tlink_record(name, e1, e2, relType)
                    all_tlinks_file.write(record)
                    tlinks_event_event.write(record)

                elif from_event and to_time:
                    # tlinks_timex_event
                    # tlinks_dct_event
                    eventInstanceID = take_word_from_text_and_return_id('eventInstanceID', tlink)
                    t1 = take_word_from_text_and_return_id('relatedToTime', tlink)
                    e1 = eiid_to_eid[eventInstanceID]
                    relType = take_word_from_text_and_return_id('relType', tlink)
                    record = _tlink_record(name, e1, t1, relType)
                    # Links whose time is the document creation time go to
                    # their own file.
                    if t1 == dct_tid:
                        tlinks_dct_event.write(record)
                    else:
                        tlinks_timex_event.write(record)
                    all_tlinks_file.write(record)
                    if debug >= 2:
                        print(tlink)
                        print('%s %s %s' % (e1, t1, relType))

                elif from_time and to_event:
                    # tlinks_timex_event
                    t1 = take_word_from_text_and_return_id('timeID', tlink)
                    relatedToEventInstance = take_word_from_text_and_return_id('relatedToEventInstance', tlink)
                    e1 = eiid_to_eid[relatedToEventInstance]
                    relType = take_word_from_text_and_return_id('relType', tlink)
                    reversed_rel = reverse_relation(relType)
                    if debug >= 2:
                        print(tlink)
                        print('%s %s %s %s' % (t1, e1, relType, reversed_rel))
                    # Written event-first, hence the reversed relation.
                    record = _tlink_record(name, e1, t1, reversed_rel)
                    all_tlinks_file.write(record)
                    tlinks_timex_event.write(record)

                elif from_time and to_time:
                    # tlinks_timex_timex
                    t1 = take_word_from_text_and_return_id('timeID', tlink)
                    t2 = take_word_from_text_and_return_id('relatedToTime', tlink)
                    relType = take_word_from_text_and_return_id('relType', tlink)
                    if debug >= 2:
                        print(tlink)
                        print('%s %s %s' % (t1, t2, relType))
                    record = _tlink_record(name, t1, t2, relType)
                    all_tlinks_file.write(record)
                    tlinks_timex_timex.write(record)

                else:
                    # Unrecognised TLINK shape: show it instead of dropping
                    # it silently.
                    print(tlink)
    finally:
        # Close whatever was opened, even if parsing failed part-way.
        for handle in handles:
            handle.close()

# Run the conversion immediately; there is no __main__ guard, so importing
# this module triggers it as well.
timebank_to_tempeval() 


