"""
Functions: 
    This module analyzes the distribution of OOV (out-of-vocabulary) words in different corpora.
Requirements: 
    dictionary object file
Input: 
    corpora with one sentence per line
"""
import os
import langToolkit
import pickle

# Load the pickled dictionary object once, at import time.
# NOTE(review): nothing visible in this file reads wordSet; presumably other
# code relies on it -- confirm before removing.
# NOTE: pickle.load can execute arbitrary code, so dict.pickle must come from
# a trusted source.
# NOTE(review): text mode 'r' is kept from the original; pickles written with
# protocol >= 1 need 'rb' on platforms that translate newlines -- confirm how
# dict.pickle was written.
with open('../data/dict.pickle', 'r') as _dictFile:
   # The context manager closes the handle as soon as loading finishes or fails.
   wordSet = pickle.load(_dictFile)
def getOOVDist(infile):
   """Print the distribution of per-line OOV ratios found in a corpus.

   Each line of ``infile`` is lower-cased, stripped and tokenized with
   ``langToolkit.tokenizeTweet``. The IV count of a line is the length of
   ``langToolkit.getIVWords(line)`` and its OOV count is the length of
   ``langToolkit.getCheckpoints(tokens)``. Lines for which both counts are
   zero are skipped. Every other line is assigned to an integer bucket:

   * ``-1`` when the line has no OOV words;
   * otherwise ``int(10 * OOVs / (OOVs + IVs))``, a value from 0 to 10.

   Output (written to stdout):
       A header line ``"<infile> :<number of counted lines>"`` followed by
       one ``"<bucket>\\t<fraction of counted lines>"`` line per bucket, in
       ascending bucket order.

   Args:
       infile: path of the corpus file, one sentence per line.

   Returns:
       None.
   """
   ratioDict = dict()
   counter = 0
   # The context manager closes the corpus even if a langToolkit call raises.
   with open(infile, 'r') as fin:
      # Iterating the file object streams it line by line, so no manual
      # chunked readlines() loop is needed.
      for line in fin:
         line = line.lower().strip()
         tokens = langToolkit.tokenizeTweet(line)
         # NOTE(review): IV words are computed from the raw line while the
         # OOV checkpoints are computed from the tokens -- confirm intended.
         IVs = len(langToolkit.getIVWords(line))
         OOVs = len(langToolkit.getCheckpoints(tokens))
         if IVs + OOVs == 0:
            continue
         if OOVs == 0:
            ratio = -1  # sentinel bucket: the line has no OOV words
         else:
            # Scale the OOV share to 0..10 and truncate to an integer bucket.
            ratio = int(float(OOVs * 10) / (OOVs + IVs))
         counter += 1
         ratioDict[ratio] = ratioDict.get(ratio, 0) + 1
   # Single-argument print(...) behaves identically on Python 2 and Python 3.
   print("{0} :{1}".format(infile, counter))
   # Buckets are unique keys, so sorting the (bucket, count) pairs orders
   # them by bucket. When counter is 0 there are no buckets, so the division
   # below is never reached.
   for bucket, count in sorted(ratioDict.items()):
      print("{0}\t{1}".format(bucket, float(count) / counter))

if __name__ == "__main__":
   # Script entry point: report the OOV distribution of the default corpus.
   corpusPath = '../data/combSMS'
   getOOVDist(corpusPath)
