"""
Functions:
    This module generates context support features from dependencies.
Input:
    dependency file generated by Stanford parsers described in the paper.
"""
import os
import re
import pickle

# One dependency line, e.g. "nsubj(runs-2, dog-1)": relation name, then two
# "word-position" arguments separated by ", ".
pattern = r"(?P<relation>\w+)\((?P<w1>[\w']+)\-(?P<p1>\d+), (?P<w2>[\w']+)\-(?P<p2>\d+)\)"

def extractData(line, isFullSupp, flagDict):
    """
    Parse one dependency line into context support features.

    Args:
        line: a single dependency string such as "nsubj(runs-2, dog-1)".
            It is matched from the start of the string against `pattern`.
        isFullSupp: when truthy, features are flat strings
            "<word><offset><other word>"; otherwise they are tuples
            ("<word><offset>", "<other word>") meant for a nested dictionary.
        flagDict: maps a token position to the merged contraction that
            should replace the word found at that position (see getflagDict).

    Returns:
        A list with two features (one per direction of the dependency), or
        an empty list when `line` does not look like a dependency.
    """
    m = re.match(pattern, line)
    if m is None:
        return []

    w1 = m.group('w1')
    w2 = m.group('w2')
    p1 = int(m.group('p1'))
    p2 = int(m.group('p2'))

    # Contractions: replace a flagged piece by the merged word and shift its
    # position by one towards the other argument where needed.  Only one
    # side is rewritten per line: the second argument is examined only when
    # the first one is not flagged.
    if p1 in flagDict:
        merged = flagDict[p1]  # look up before p1 is adjusted
        if "'" in w1 and p1 > p2:
            p1 -= 1
        elif "'" not in w1 and p1 < p2:
            p1 += 1
        w1 = merged
    elif p2 in flagDict:
        merged = flagDict[p2]  # look up before p2 is adjusted
        if "'" in w2 and p1 < p2:
            p2 -= 1
        elif "'" not in w2 and p1 > p2:
            p2 += 1
        w2 = merged

    # Plain truthiness (rather than "== True") keeps this function in step
    # with genSupp, which also branches on "if isFullSupp".
    if isFullSupp:  # flat dictionary keys
        return [
            "{0}{1}{2}".format(w1, p1 - p2, w2),
            "{0}{1}{2}".format(w2, p2 - p1, w1),
        ]
    # (key, value) tuples for the hierarchical dictionary
    return [
        ("{0}{1}".format(w1, p1 - p2), w2),
        ("{0}{1}".format(w2, p2 - p1), w1),
    ]

def getflagDict(line):
    """
    Locate split contractions in a space-separated sentence.

    Every token containing an apostrophe is glued to the token before it
    (e.g. "they" + "'re" -> "they're").  The merged string is stored under
    two keys: the 0-based index i of the apostrophe token and i + 1.
    extractData looks these keys up against positions parsed from
    dependency lines (presumably 1-based, so i and i + 1 would address the
    preceding word and the apostrophe token respectively -- TODO confirm
    against the data).

    Args:
        line: sentence text with tokens separated by single spaces.

    Returns:
        dict mapping int position -> merged contraction string.
    """
    tokenList = line.split(' ')
    flagDict = {}
    for i, token in enumerate(tokenList):
        if "'" not in token:
            continue
        if i == 0:
            # No preceding token to merge with; tokenList[i - 1] would wrap
            # around to the LAST token of the sentence.
            continue
        merged = tokenList[i - 1] + token
        flagDict[i] = merged
        flagDict[i + 1] = merged
    return flagDict

def genSupp(inFile, isFullSupp):
    """
    Obtain support context dictionary

    The input file is read as a sequence of records:
        1. a sentence line (used to detect contractions),
        2. one line that is read and ignored,
        3. dependency lines, terminated by an empty line or end of file.

    Args:
        inFile: path of the dependency file.
        isFullSupp: when truthy, build a flat dictionary
            {feature string: count}; otherwise build a nested dictionary
            {"<word><offset>": {other word: count}}.

    Returns:
        The populated count dictionary.
    """
    queryDict = dict()
    # "with" guarantees the file is closed even if parsing raises.
    with open(inFile, 'r') as f:
        while True:
            line = f.readline()
            if not line:
                break
            # Deal with contraction scenarios (e.g. They're, She's, I'm):
            # dependencies refer to the split pieces ("'re"), which are
            # replaced by the unsplit word ("they're") with positions
            # modified accordingly.
            flagDict = getflagDict(line.strip().lower())
            f.readline()  # the line after the sentence is not used
            while True:
                line = f.readline().rstrip().lower()
                if len(line) == 0:
                    # blank line (or EOF, where readline returns '') ends
                    # the dependency list of this sentence
                    break
                # Lines that are not dependencies yield an empty list.
                for supp in extractData(line, isFullSupp, flagDict):
                    if isFullSupp:
                        queryDict[supp] = queryDict.get(supp, 0) + 1
                    else:
                        inner = queryDict.setdefault(supp[0], dict())
                        inner[supp[1]] = inner.get(supp[1], 0) + 1
    return queryDict

if __name__ == "__main__":
    inFile = "../data/gigaword.typeddependencies"
    outFile = "../data/contextSupport.pickle"
    outFullFile = "../data/contextDect.pickle"
    # Pickle streams are bytes: the output files are opened in binary mode
    # and closed deterministically by the "with" blocks.
    # obtain the hierarchical dictionary (outFile) for ill-formed word normalisation
    queryDict = genSupp(inFile, False)
    with open(outFile, 'wb') as out:
        pickle.dump(queryDict, out)
    # obtain the flat dictionary (outFullFile) for ill-formed word detection
    queryDictFull = genSupp(inFile, True)
    with open(outFullFile, 'wb') as out:
        pickle.dump(queryDictFull, out)

