"""
Function: 
    This module does basic language processing for tweets, including separating in-vocabulary (IV) words from out-of-vocabulary (OOV) words. The tokenizer is adapted from the NLTK toolkit tokenizer.
Requirements: 
    The input should be ASCII-encoded text. The NLTK module and the pickled dictionary object file must be available.
"""
import nltk
import pickle
import re
# Tokenisation regular expression, adapted from the NLTK tokeniser.
# Alternatives are tried in order at each position:
#   1. abbreviations made of letter-dot pairs, e.g. "Z.Z."
#   2. http:// URLs (the leading whitespace is part of the match)
#   3. words with an optional leading '#'/'@' and internal '-' or "'"
#   4. numbers with optional '$' prefix, decimal part and '%' suffix
#   5. runs of two or more dots
#   6. single punctuation characters
# All groups are non-capturing: a regex applied with re.findall (as NLTK 3's
# regexp_tokenize does) returns the group contents rather than the whole
# match whenever the pattern contains capturing groups.
# NOTE(review): ":-_" in the last character class is a range (':' to '_'),
# so it also matches '<', '=', '>', '@', '^', etc.; presumably a literal '-'
# was intended. Left unchanged to preserve tokenisation -- confirm.
pattern = r"""(?x)
         (?:[A-Za-z]\.){2,}
         |\s+http://[^\s]+
         |[#@]?\w+(?:[-']\w+)*
         |\$?\d+(?:\.\d+)?%?
         |[\.]{2,}
         |[][.,;"'?!():-_`]
         """
# Load the pickled dictionary object once at import time; the rest of the
# module only uses it for membership tests.
# The file is opened in binary mode, as the pickle documentation recommends,
# and the context manager closes the handle as soon as loading finishes.
# NOTE: unpickling can execute arbitrary code -- only load a trusted file.
with open('../data/dict.pickle', 'rb') as dictFile:
    dictSet = pickle.load(dictFile)

def tokenizeTweet(tweet):
    """
    Split a tweet into lower-cased, whitespace-stripped tokens.

    The tweet is stripped and tokenised with nltk.regexp_tokenize using the
    module-level ``pattern``; tokens that are empty after stripping are
    dropped.
    """
    rawTokens = nltk.regexp_tokenize(tweet.strip(), pattern)
    tokenList = []
    for rawToken in rawTokens:
        # Skip matches that consist of whitespace only.
        if not rawToken.strip():
            continue
        tokenList.append(rawToken.lower().strip())
    return tokenList

def isInDictionary(token):
    """
    Return True if the lower-cased, stripped token is in dictSet, else False.
    """
    normalised = token.lower().strip()
    return normalised in dictSet

def getOOVWords(tweet):
    """
    Return the tokens of the tweet that are not in dictSet, in tweet order.
    """
    oovWords = []
    for token in tokenizeTweet(tweet):
        if token not in dictSet:
            oovWords.append(token)
    return oovWords

def getIVWords(tweet):
    """
    Return the tokens of the tweet that are in dictSet, in tweet order.
    """
    ivWords = []
    for token in tokenizeTweet(tweet):
        if token in dictSet:
            ivWords.append(token)
    return ivWords

# Compiled once at import time instead of being looked up for every token.
# Raw strings keep the "\d" escape from being treated as a string escape.
# Matches tokens made only of ASCII letters, digits, apostrophes and hyphens.
_WORD_LIKE_RE = re.compile(r"^[A-Za-z0-9'-]+$")
# Matches tokens that consist of two or more digits.
_MULTI_DIGIT_RE = re.compile(r"^[\d]{2,}$")

def getCheckpoints(tokens):
    """
    Return the indices of the tokens of concern.

    A token is selected when it is not in dictSet, consists only of letters,
    digits, apostrophes and hyphens, and is not a number of two or more
    digits (single digits are still selected).

    tokens: sequence of token strings.
    Returns a list of integer positions into ``tokens``, in ascending order.
    """
    return [
        i for i, token in enumerate(tokens)
        if token not in dictSet
        and _WORD_LIKE_RE.match(token)
        and not _MULTI_DIGIT_RE.match(token)
    ]

def getCheckpointList(tweet):
    """
    Tokenise the tweet and return the checkpoint indices of its tokens.
    """
    return getCheckpoints(tokenizeTweet(tweet))

if __name__ == "__main__":
    # Manual smoke test: prints the sample tweet, its OOV and IV words, and
    # each checkpoint index with its token.
    # Every print is a single-argument call so the module parses under both
    # Python 2 and Python 3; a print statement here would be a SyntaxError
    # on Python 3 and would stop the module from being imported at all.
    print("Unit test")
    tweet = "I'm Mr Unknown from Z.Z. , @abc #abc I'm okay Ph-D degree. How are yu? and y. let's get ... and .... so that .. could be 2 you 2 11 four 4 tested."
    print(tweet)
    tokens = tokenizeTweet(tweet)
    print(getOOVWords(tweet))
    print(getIVWords(tweet))
    for i in getCheckpointList(tweet):
        # Formatted explicitly so the output is "index token" on both
        # Python versions rather than a tuple repr on Python 2.
        print("%s %s" % (i, tokens[i]))

