import os
import sys
import codecs
import random
import fasttext
import subprocess


def run_command(cmd):
  """Run *cmd* through the shell, wait for it to finish, and return its exit status.

  Args:
    cmd: Command line as a single string. It is interpreted by the shell
      (``shell=True``), so redirections such as ``<`` and ``>`` work, but the
      caller is responsible for quoting any path or argument that may contain
      spaces or shell metacharacters.

  Returns:
    The exit status of the command as an int (0 conventionally means
    success). Callers that ignored the previous ``None`` return are unaffected.
  """
  # subprocess.call waits via the Popen object itself, so the exit status is
  # captured instead of being reaped behind subprocess's back with os.waitpid.
  return subprocess.call(cmd, shell=True)


def buildSlm(workdir, tooldir, ngrams):
  """Build an ARPA file and a binary SLM for every label / data-type corpus.

  For each label 1..10 and each of 'questions', 'answers' and 'wholeQA', this
  runs ``<tooldir>/lmplz`` with stdin redirected from
  ``<workdir>/<type>.<label>.corpus`` and stdout redirected to the matching
  ``.arpa`` file, then runs ``<tooldir>/build_binary`` to turn that ``.arpa``
  file into a ``.slm`` file.

  Args:
    workdir: Directory holding the ``.corpus`` files; outputs are written here.
    tooldir: Directory containing the ``lmplz`` and ``build_binary`` executables.
    ngrams: N-gram order passed to ``lmplz -o``; must be a string because it is
      concatenated into the command line.
  """
  for label in range(1, 11):
    for kind in ('questions', 'answers', 'wholeQA'):
      # All three files share the same "<workdir>/<kind>.<label>" stem.
      stem = os.path.join(workdir, kind + '.' + str(label))
      corpus_path = stem + '.corpus'
      slm_path = stem + '.slm'
      arpa_path = stem + '.arpa'

      # Pieces are joined with single spaces, which is what the comma-separated
      # print statement emitted; print then appends the trailing newline.
      print(' '.join(["\n\nBuilding ", str(ngrams), "-grams SLM arpa for label: ", str(label), "....\n\n"]))
      arpa_cmd = ''.join([tooldir, '/lmplz   -o ', ngrams, '  --skip_symbols  <  ', corpus_path, '  >  ', arpa_path])
      run_command(arpa_cmd)

      print(' '.join(["\n\ncompiling SLM for label:", str(label), "..... \n\n"]))
      binary_cmd = ''.join([tooldir, '/build_binary   ', arpa_path, '   ', slm_path])
      run_command(binary_cmd)


def trainModel(workdir, tooldir, datatype, ngrams):
  """Split the labeled training file into per-label corpora, then build the SLMs.

  Reads ``<workdir>/../yahoo_answers.<datatype>.train`` as UTF-8. Every line is
  split on ','. The first field is the label, the second field is the
  question, and all remaining fields (re-joined with ',') are the answer. For
  each label 1..10 three corpus files are written into ``workdir``:
  ``questions.<n>.corpus``, ``answers.<n>.corpus`` and ``wholeQA.<n>.corpus``
  (question and answer together). Finally ``buildSlm`` is called on ``workdir``.

  Lines whose first field does not contain ``__label__``, or that have no
  field after the label, are skipped.

  Args:
    workdir: Directory that receives the corpus files (and the SLM outputs).
    tooldir: Directory with the language-model tools, forwarded to ``buildSlm``.
    datatype: Middle part of the training file name (``yahoo_answers.<datatype>.train``).
    ngrams: N-gram order, forwarded to ``buildSlm``.

  Raises:
    KeyError: If a line carries a label other than ``__label__1`` .. ``__label__10``.
    IOError/OSError: If the training file or a corpus file cannot be opened.
  """
  labeledFile = os.path.join(workdir, '../yahoo_answers.' + datatype + '.train')

  # One open handle per label for each corpus kind, keyed by the label string.
  questionCorpusF, answerCorpusF, wholeCorpusF = {}, {}, {}
  try:
    for i in range(1, 11):
      label = '__label__' + str(i)
      # os.path.join keeps these paths identical to the ones buildSlm reads,
      # including when workdir is empty or already ends with a separator.
      questionCorpusF[label] = codecs.open(os.path.join(workdir, 'questions.' + str(i) + '.corpus'), 'w', 'utf-8')
      answerCorpusF[label] = codecs.open(os.path.join(workdir, 'answers.' + str(i) + '.corpus'), 'w', 'utf-8')
      wholeCorpusF[label] = codecs.open(os.path.join(workdir, 'wholeQA.' + str(i) + '.corpus'), 'w', 'utf-8')

    # Parse the raw labeled data.
    with codecs.open(labeledFile, 'rb', 'utf-8') as rawF:
      for line in rawF:
        info = line.strip().split(',')
        if '__label__' not in info[0]:
          continue
        if len(info) < 2:
          # Label with nothing after it: there is no question to record.
          continue
        label = info[0].strip()
        question = info[1].strip().strip(',').strip()
        answer = ','.join(info[2:]).strip().strip(',').strip()
        QA = ','.join(info[1:]).strip().strip(',').strip()
        questionCorpusF[label].write(question + '\n')
        answerCorpusF[label].write(answer + '\n')
        wholeCorpusF[label].write(QA + '\n')
  finally:
    # Close (and thereby flush) every handle that was opened, even when
    # opening or parsing failed part-way through.
    for handles in (questionCorpusF, answerCorpusF, wholeCorpusF):
      for handle in handles.values():
        handle.close()

  buildSlm(workdir, tooldir, ngrams)


if __name__ == '__main__':
  # Exactly four arguments are expected after the script name:
  # WorkDir, KenlmDir, DataType and ngrams.
  if len(sys.argv) != 5:
    print(" usage: python trainSLMs.py WorkDir KenlmDir DataType(full, labeled, unlabeled, or blurred15 etc.) ngrams(3 or 4 or 5) ")
    # sys.exit instead of the bare exit(): the latter is injected by the site
    # module for interactive use and is missing when site is disabled.
    sys.exit(1)
  workdir, tooldir, datatype, ngrams = sys.argv[1:]
  trainModel(workdir, tooldir, datatype, ngrams)


