import os
import sys
import codecs
import random
import fasttext
import numpy
import subprocess

def run_command(cmd):
  """Run a shell command, block until it finishes and return its exit status.

  Args:
    cmd: command line as a single string; it is interpreted by the shell
      (shell=True), so the caller is responsible for quoting arguments.

  Returns:
    The exit status reported by Popen.wait().
  """
  p = subprocess.Popen(cmd, shell=True)
  # Popen.wait() is the portable way to wait for the child; it also records the
  # return code on the Popen object, which a raw os.waitpid() call bypasses.
  return p.wait()



def writeFastTextCorpus(data, filename):
  """Shuffle the samples and dump them to a UTF-8 encoded corpus file.

  Args:
    data: list of text samples; it is shuffled IN PLACE, so the caller's
      list order changes. Each sample is written as-is (no newline added).
    filename: path of the corpus file to create or overwrite.
  """
  random.shuffle(data)
  with codecs.open(filename, 'w', 'utf-8') as sink:
    emit = sink.write
    for record in data:
      emit(record)


def ActiveLearn(workdir, tooldir, enrichCounts=None):
  """Run the simulated active-learning experiment and write an accuracy report.

  A baseline model is trained on the manually labelled samples. Then, for every
  entry of enrichCounts, samples picked by enrichingCorpus() from the unlabeled
  pool are added to the labelled samples (each round starts again from the
  labelled samples, rounds are not cumulative), the model is retrained and its
  precision recorded.

  Files read (relative to workdir):
    ../yahoo_answers.labeled.train    labelled samples, used verbatim
    ../yahoo_answers.unlabeled.train  lines of the form "<label>,<text>"
  Files written (inside workdir):
    ActiveLearnTrainCorpus                       training corpus of the current round
    AccuracyReport.activeLearningBaseline.txt    list of (enrichCount, precision)

  Args:
    workdir: working directory for the corpus, model and report files.
    tooldir: directory passed on to trainModel().
    enrichCounts: optional sequence with the number of samples to add in each
      round; defaults to the schedule this script has always used.
  """
  if enrichCounts is None:
    enrichCounts = [1235000, 1117000, 1105000, 1040000, 910000, 780000, 650000, 520000, 390000]

  dataDir = os.path.join(workdir, '..')
  trainFile = os.path.join(workdir, 'ActiveLearnTrainCorpus')
  model = os.path.join(workdir, 'ActiveLearn.model')

  # load manual labelled data as baseline
  with codecs.open(os.path.join(dataDir, 'yahoo_answers.labeled.train'), 'rb', 'utf-8') as trainf:
    initLabelSamples = list(trainf)

  # load unlabeled data
  unLabeledData = []
  # Original label of every unlabeled sample; only used to simulate the human
  # labelling step during active learning.
  unLabeledDataOrgMap = {}
  with codecs.open(os.path.join(dataDir, 'yahoo_answers.unlabeled.train'), 'r', 'utf-8') as rawf:
    for line in rawf:
      # Everything before the first comma is the label, the rest is the text.
      lbl, _, content = line.strip().partition(',')
      if '__label__' not in lbl:
        continue  # malformed line: no label marker
      if not content.strip():
        continue  # malformed line: empty text
      unLabeledData.append(content)
      unLabeledDataOrgMap[content] = lbl

  print('Total labeled data samples:  %d  , unlabeledData:  %d'
        % (len(initLabelSamples), len(unLabeledData)))

  print('Initializaing model ...')
  # NOTE: writeFastTextCorpus shuffles the list it is given in place.
  writeFastTextCorpus(initLabelSamples, trainFile)
  # The baseline is stored as (0, precision) so that every entry of the report
  # has the same (enrichCount, precision) shape.
  modelPerformance = [(0, trainModel(trainFile, workdir, tooldir))]

  for enrichCount in enrichCounts:
    print('\n\n\nActive-Learning  iterating with enriching  %s  samples ....' % (enrichCount,))
    # enrich corpus
    viewSamples = enrichingCorpus(model, unLabeledData, unLabeledDataOrgMap, enrichCount)
    trainCorpus = initLabelSamples + viewSamples

    writeFastTextCorpus(trainCorpus, trainFile)
    curModelPrecision = trainModel(trainFile, workdir, tooldir)
    modelPerformance.append((enrichCount, curModelPrecision))
    print(' with enriching  %s samples, accuracy is: %s' % (enrichCount, curModelPrecision))

  # summarize performance
  print('Interation Performances are:  %s' % (str(modelPerformance),))
  reportPath = os.path.normpath(os.path.join(workdir, 'AccuracyReport.activeLearningBaseline.txt'))
  with codecs.open(reportPath, 'w', 'utf-8') as outf:
    outf.write("\n\nActive Learning Training Logs is:\n " + str(modelPerformance) + '\n')


def enrichingCorpus( modelFile , unlabeledData, unLabeledDataOrgMap, enrichCount ):
  """Pick the most and the least confidently classified unlabeled samples.

  Every sample is scored with the mean of the probabilities returned by
  classifier.predict_proba(..., k=10). The samples are ranked by that score and
  enrichCount // 2 samples are taken from each end of the ranking.

  Args:
    modelFile: model path without the '.bin' suffix.
    unlabeledData: iterable of sample texts.
    unLabeledDataOrgMap: dict mapping a sample text to its original label; it
      stands in for the human annotator.
    enrichCount: total number of samples requested (int or int-like string).

  Returns:
    List of corpus lines "<label>\\t , <text>\\n", high-confidence samples first.
    A sample is never returned twice for the same position in unlabeledData:
    when fewer than enrichCount samples are available the two ends of the
    ranking are not allowed to overlap. The list is empty when nothing can be
    selected.
  """
  # active learning is based on classifier's confidence level
  # Floor division keeps the slice bound an int on Python 3 as well; negative
  # requests are treated as "select nothing".
  halfCount = max(int(enrichCount) // 2, 0)
  classifier = fasttext.load_model( modelFile + '.bin', label_prefix='__label__')
  print('enriching active learning corpus data ...')

  # NOTE(review): the score is the mean over the top-10 probabilities. If the
  # model has 10 labels or fewer these presumably cover the whole distribution,
  # which would make the score nearly constant - confirm this is intended.
  scored = []
  for sample in unlabeledData:
    if not sample.strip():
      print('Error during enriching corpus!!!\n Sample:\n%s' % (sample,))
      continue
    result = classifier.predict_proba([sample], k=10)[0]
    if not result:
      continue  # no prediction returned, the sample cannot be ranked
    meanConf = numpy.mean([pair[1] for pair in result])
    scored.append((sample, meanConf))

  ranked = sorted(scored, key=lambda x: x[1], reverse=True)
  total = len(ranked)
  highConf = ranked[:halfCount]
  # Start the low-confidence slice no earlier than where the high-confidence
  # slice ends. A plain ranked[-halfCount:] would return the WHOLE list for
  # halfCount == 0 and would overlap highConf when 2 * halfCount > total.
  lowConf = ranked[max(halfCount, total - halfCount):]

  return [unLabeledDataOrgMap[sample] + '\t , ' + sample + '\n'
          for sample, _ in highConf + lowConf]



def trainModel(corpusFile, workdir,  tooldir ):
  """Train a fastText supervised model on corpusFile and return its test precision.

  The model is trained by running the 'fasttext' executable found in tooldir,
  which writes workdir/ActiveLearn.model(.bin). The resulting model is then
  loaded and evaluated on workdir/../yahoo_answers.test.

  Args:
    corpusFile: path of the training corpus.
    workdir: directory that receives the model file.
    tooldir: directory containing the 'fasttext' executable.

  Returns:
    The precision attribute of the result of classifier.test().
  """
  modelfile = os.path.join(workdir, 'ActiveLearn.model' )
  testfile = os.path.join(workdir, '../yahoo_answers.test')

  # Training parameters; kept as strings because they are spliced into the
  # command line below.
  dim = "10"
  lr =  "0.1"
  epoch = "5"
  minCount = "1"
  word_ngrams = "2"
  bucket = "10000000"
  loss = "softmax"

  # Train the classifier with the fasttext command line tool.
  # NOTE: the command is a plain shell string, so paths containing spaces or
  # shell metacharacters are not supported. The outcome of the command is not
  # checked here; if training fails, a model file left over from an earlier
  # run (if any) is what gets loaded below.
  cmdParts = [
    tooldir + '/fasttext', 'supervised',
    '-input', corpusFile,
    '-output', modelfile,
    '-dim', dim,
    '-lr', lr,
    '-epoch', epoch,
    '-minCount', minCount,
    '-bucket', bucket,
    '-thread', '1',
    '-minn', '0',
    '-maxn', '0',
    '-loss', loss,
    '-wordNgrams', word_ngrams,
  ]
  run_command(' '.join(cmdParts))

  # Load the freshly trained model
  classifier = fasttext.load_model( modelfile + '.bin', label_prefix='__label__')

  # Test the classifier
  result = classifier.test(testfile)
  print('Current round model precison:  %s  with model:  %s'
        % (result.precision, os.path.basename(modelfile)))
  return result.precision



if __name__ == '__main__':
  # Script entry point: expects exactly two arguments, WorkDir and ToolDir.
  if len(sys.argv) != 3:
    print(" usage: python activelearning.py WorkDir ToolDir")
    # sys.exit is always available; the bare exit() helper is only injected by
    # the site module and is meant for interactive use.
    sys.exit(1)
  ActiveLearn(sys.argv[1], sys.argv[2])


