"""
Functions: 
    This program converts the original SMS corpus into the lexical normalisation task format.
Requirements: 
    dictionary object file
Input: 
    original corpus is from http://www.cel.iitkgp.ernet.in/~monojit/sms/word_aligned_parallel_corpus.txt
    The corpus is renamed (sms.raw) and saved in data folder.
"""
import pickle
import re

# Matches one word-aligned corpus line of the form
# "<index> <sms token> - <normalised token>" (the line is lower-cased before matching).
pattern = r"^\d+ (?P<in>[a-z0-9']+) \- (?P<out>[a-z0-9']+)$"

# Pickle data is binary, so the file is opened in 'rb' mode, and the context
# manager closes the handle as soon as the dictionary has been loaded.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only run this script against a trusted dict.pickle.
# dictSet is only used for membership tests -- presumably a set of
# in-vocabulary words; confirm against the script that builds dict.pickle.
with open('../data/dict.pickle', 'rb') as dictFile:
    dictSet = pickle.load(dictFile)

# Raw corpus; it is read (and closed) by the conversion loop further down.
f = open('../data/sms.raw')

# Token pairs collected for the sentence currently being read.
inputList = []
normList = []

# Statistics updated by outputSen: sentences printed / valid pairs found.
counter = 0
tCounter = 0

def outputSen(inputList, normList):
    """Print one sentence in lexical normalisation format if it is useful.

    A token pair is a valid sample when the input token differs from its
    normalisation, the input token is not in ``dictSet`` and the normalised
    token is in ``dictSet``.  Every other pair is neutralised by replacing
    the normalisation with the input token itself.

    When the sentence contains at least one valid sample, the number of
    tokens is printed on its own line, followed by one
    ``input<TAB>normalisation`` line per token.  Sentences without any
    valid sample (and empty sentences) produce no output.

    Args:
        inputList: tokens of the sentence as they appear in the corpus.
        normList: aligned normalisations, one per entry of ``inputList``.
            This list is modified in place (invalid pairs are overwritten
            with the input token).

    Side effects:
        Increments the module-level ``tCounter`` once per valid sample and
        ``counter`` once per printed sentence; writes to standard output.
    """
    global counter, tCounter
    if not inputList:
        return
    hasValidPair = False
    for i, token in enumerate(inputList):
        norm = normList[i]
        # Valid samples: input is OOV and norm is IV
        if token != norm and token not in dictSet and norm in dictSet:
            hasValidPair = True
            tCounter += 1
        else:
            normList[i] = token
    if hasValidPair:
        # Single-argument print(...) gives the same output on Python 2 and 3,
        # unlike the Python-2-only print statement.
        print(len(inputList))
        counter += 1
        for token, norm in zip(inputList, normList):
            print("{0}\t{1}".format(token, norm))
# Convert: walk the corpus line by line; a blank line ends the current sentence.
# `with f` guarantees the corpus file is closed even if processing raises.
with f:
    for line in f:
        line = line.strip().lower()
        if not line:
            # Sentence boundary: emit what was collected and start afresh.
            outputSen(inputList, normList)
            inputList = []
            normList = []
        elif line.startswith("<sms"):
            # Markup line; it carries no token pair.
            continue
        else:
            m = re.match(pattern, line)
            # Lines that do not match the "<index> <token> - <token>" shape
            # are silently skipped.
            if m is not None:
                inputList.append(m.group("in"))
                normList.append(m.group("out"))
# Flush the last sentence in case the file does not end with a blank line.
outputSen(inputList, normList)
# At this point `counter` holds the number of printed sentences and
# `tCounter` the number of valid token pairs; they are intentionally not
# printed so that standard output contains only the converted corpus.
