import os
import sys
import codecs
import random
import fasttext
import subprocess

def run_command(cmd):
  """Run *cmd* through the shell, wait for it to finish, and return its exit status.

  cmd is a single command-line string; because shell=True is used it is
  interpreted by the shell, so callers are responsible for any quoting.

  Returns the child's return code as reported by Popen.wait(), which lets
  callers notice a failed command.
  """
  p = subprocess.Popen(cmd, shell=True)
  # Popen.wait() is used instead of os.waitpid(p.pid, 0): it works on every
  # platform subprocess supports and records the status on the Popen object.
  return p.wait()



def writeFastTextCorpus(data, filename):
  """Write the samples in *data* to *filename* (UTF-8) in random order.

  data:     iterable of text samples; each one is written verbatim, so every
            sample must already end with its own newline.
  filename: path of the corpus file to create or overwrite.

  The shuffle is done on a private copy, so the caller's sequence keeps its
  original order.
  """
  # Copy first: random.shuffle works in place and would otherwise reorder
  # the list owned by the caller.
  shuffled = list(data)
  random.shuffle(shuffled)
  with codecs.open(filename, 'w', 'utf-8') as corpusF:
    for sample in shuffled:
      corpusF.write(sample)



def trainModel(corpusFile, modelfile, testfile,  tooldir ):
  # set params
  dim = "10"
  lr =  "0.1"
  epoch = "5"
  minCount = "1"
  word_ngrams = "2"
  bucket = "10000000"
  loss = "softmax"

  # calculate Slp and Rank score
  cmdout = run_command( tooldir + '/fasttext ' + " supervised  -input  " + corpusFile + "  -output  " + modelfile \
                + " -dim  " + dim + " -lr  " + lr + " -epoch  " + epoch + " -minCount  " + minCount + " -bucket " + bucket \
                + " -thread 1 -minn 0  -maxn 0  " + " -loss " + loss + " -wordNgrams " + word_ngrams )

  # Train the classifier
  classifier = fasttext.load_model( modelfile + '.bin', label_prefix='__label__')

  # Test the classifier
  result = classifier.test(testfile)
  print "Current round model precison: ", result.precision, " with model: ", os.path.basename(modelfile)
  return result.precision




def coTrainingSingle(workdir, tooldir):

  #load manual labelled data as baseline
  initLabelSamples = []
  with codecs.open( workdir + '/../yahoo_answers.labeled.train' , 'rb', 'utf-8') as trainf:
    for line in trainf:
      initLabelSamples.append( line )

  #load unlabeled data
  unLabeledData = []
  with codecs.open(workdir + '/../yahoo_answers.unlabeled.train', 'r', 'utf-8') as rawf:
    for line in rawf:
      info  = line.strip().split(',')
      content = ','.join(info[1:]) # remove and forget labels
      if len(content.strip()) < 1:
        #print "Error found in unlabeled data line: ", content, " \n Org line:\n", line
        continue
      unLabeledData.append( content )

  print 'Total labeled data samples: ', len(initLabelSamples), ' , unlabeledData: ' ,  len(unLabeledData)

  trainCorpusA, trainCorpusB = initLabelSamples, initLabelSamples
  trainFileA, trainFileB = os.path.join(workdir, 'CoTrainCorpusA' ), os.path.join(workdir, 'CoTrainCorpusB' )
  modelA, modelB = os.path.join(workdir, 'CoTrain.modelA' ), os.path.join(workdir, 'CoTrain.modelB' )
  testfile = os.path.join(workdir, '../yahoo_answers.test')
  modelPerformanceA, modelPerformanceB  = [], []

  print 'Initializaing model A ...'
  writeFastTextCorpus(trainCorpusA, trainFileA)
  curModelPrecision = trainModel( trainFileA, modelA , testfile , tooldir)
  modelPerformanceA.append(curModelPrecision)

  print 'Initializaing model B ...'
  writeFastTextCorpus(trainCorpusB, trainFileB)
  curModelPrecision = trainModel(trainFileB, modelB , testfile, tooldir)
  modelPerformanceB.append(curModelPrecision)

  for itr in xrange(50):
    print '\n\n\nCo-Training  iterating #', itr+1, ' ....'
    # enrich corpus A
    viewASamples = enrichingCorpus( modelB , unLabeledData )

    #TODO: try another option (with 3M samples): traincorpusA = initLabelSamples + viewASamples
    trainCorpusA = initLabelSamples + viewASamples
    print ' Updating  model A ...'
    writeFastTextCorpus(trainCorpusA, trainFileA)
    curModelPrecision = trainModel(trainFileA, modelA, testfile, tooldir)
    modelPerformanceA.append(curModelPrecision)

    # enrich corpus B
    viewBSamples = enrichingCorpus( modelA , unLabeledData )
    trainCorpusB = initLabelSamples + viewBSamples
    writeFastTextCorpus(trainCorpusB, trainFileB)
    print "Updating model B ..."
    curModelPrecision = trainModel(trainFileB, modelB , testfile, tooldir)
    modelPerformanceB.append(curModelPrecision)

  #summarize performance
  print 'Interation Performances of Model A are: ', str(modelPerformanceA)
  print 'Interation Performances of Model B are: ', str(modelPerformanceB)
  with codecs.open(os.path.normpath(workdir + '/AccuracyReport.cotrainingSingleView.txt'), 'w', 'utf-8') as outf:
    outf.write("\n\nSingle View CoTraining Performance Logs for ModelA is:\n " + str(modelPerformanceA) + '\n\n\n')
    outf.write("\n\nSingle View CoTraining Performance Logs for ModelB is:\n " + str(modelPerformanceB) + '\n\n\n')



def enrichingCorpus( modelFile , unlabeledData ):
  # corpus enriching policy is based on  classifier's confidence level
  ENRICH_COUNTS = 800000
  classifier = fasttext.load_model( modelFile + '.bin', label_prefix='__label__')
  print 'enriching co-training corpus data ...'
  predictResult = []
  for sample in unlabeledData:
    if len(sample.strip()) < 1:
      print 'Error during enriching corpus!!!\n Sample:\n', sample
      continue
    result = classifier.predict_proba([sample], k=10)[0]
    predCat = result[0][0]
    predConf = result[0][1]
    predictResult.append((sample, predCat, predConf))

  sortedSamples = sorted( list(predictResult), key=lambda x: x[2], reverse=True )
  selectedHighConf = sortedSamples[:ENRICH_COUNTS]
  highConfSamples =  zip( zip(*selectedHighConf)[0] , zip(*selectedHighConf)[1] )

  viewSamples=[]
  for entry in highConfSamples:
    content, predCat = entry
    viewSamples.append( '__label__' + predCat  + '\t , ' + content + '\n')
  return viewSamples


if __name__ == '__main__':
  if len(sys.argv) != 3:
    print " usage: python cotrainingSingle.py WorkDir ToolDir"
    exit(1)
  coTrainingSingle(sys.argv[1], sys.argv[2])


