import os
import re
import sys
import codecs
import random
import fasttext
import subprocess
import math
import xgboost as xgb
import kenlm
import numpy as np
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score
import collections
import time
import datetime

import shutil

import matplotlib

# Sentinel for "feature not available": predKlmlogPob returns it for empty
# sentences, and it has the same value as the `missing=-9999` argument that
# the xgb.DMatrix calls in this file use.
MISSING_VALUE = -9999

# Cut-off on the eval model's predicted score: testXgbModel turns predictions
# above it into label 1, and build_catreco_corpus_with_paralearn only keeps a
# guessed sample whose score is above it.
EVAL_THRESHOLD = 0.6

def run_command(cmd):
  """Run `cmd` through the shell and block until it has finished.

  Args:
    cmd: the complete command line as one string. It is passed to the shell
      unmodified, so any quoting is the caller's responsibility.

  Returns:
    None. The exit status of the command is not reported.
  """
  p = subprocess.Popen(cmd, shell=True)
  # Wait through the Popen object instead of os.waitpid(p.pid, 0): this keeps
  # the object's own returncode bookkeeping consistent and does not depend on
  # platform-specific pid semantics.
  p.wait()
  return


def is_number(nbrstring):
  """Tell whether `nbrstring` can be converted with float().

  Args:
    nbrstring: value to check, normally a string.

  Returns:
    True when float(nbrstring) succeeds, False otherwise. Values float()
    rejects with TypeError (for example None) count as "not a number"
    instead of propagating the exception.
  """
  try:
    float(nbrstring)
  except (TypeError, ValueError):
    return False
  return True



def build_catreco_corpus_with_paralearn(workdir , catRecoTrainFile, blurPercentage):
  print ' Selecting catreco corpus based on parallel learning algorithm ... '
  arpaModels = loadKenLMs( workdir )
  catrecoClassifier = fasttext.load_model( workdir + '/catreco.model.bin', label_prefix='__label__')
  xgbModel = xgb.Booster({'nthread': 4})  # init model
  xgbModel.load_model( workdir + '/eval.xgb.model')  # load model

  #load manual labelled data
  initLabelSamples = []
  with codecs.open( workdir + '/../yahoo_answers.labeled.train' , 'rb', 'utf-8') as trainf:
    for line in trainf:
      initLabelSamples.append( line )

  #load gussed and filtered data from blurred DataSet:
  filterGussedData = []
  with codecs.open(workdir + '/../yahoo_answers.blurred' + blurPercentage +  '.train', 'r', 'utf-8') as rawf:
    print "Start guessing unlabeledData's labels ...."
    processedCnt = 0
    starttime = time.time()
    for line in rawf:
      info  = line.strip().split(',')
      blurredLabel = int(info[0].replace('__label__', ''))
        # system simulated observed labels with user feedback behavior
      content = ','.join(info[1:]) # remove and forget manual labels
      # guessing its label with parallel learned models
      if len(content.strip()) < 1:
        continue

      #print out progress
      processedCnt = processedCnt + 1
      if( processedCnt % 100000 == 0 ):
        print "Guessing Unlabeled Data: processed ", processedCnt, " samples, current batch used ", time.time() - starttime, " seconds"
        starttime = time.time()

      question = info[1].strip().strip(',').strip()
      answer = ','.join(info[2:]).strip().strip(',').strip()
      QA = ','.join(info[1:]).strip().strip(',').strip()
      # use current catreco model to guess label
      predCat = int(catrecoClassifier.predict_proba([QA], k=1)[0][0][0])
      # use eval model to evaluate its quality
      evalScore = predictXgbScore(xgbModel, arpaModels, catrecoClassifier, question, answer, QA, predCat)
      # based on evaluation result, and observed blurredLabels  decide how to erich training corpus
      if evalScore > EVAL_THRESHOLD and blurredLabel == predCat:
        filterGussedData.append(  '__label__' + str(predCat) + '\t' + content + '\n' )
      # elif evalScore < EVAL_THRESHOLD and blurredLabel == predCat:
      #   filterGussedData.append('__weight__' + '0.8' + '\t +'  '__label__' + str(predCat) + '\t' + content + '\n')
      # elif evalScore > EVAL_THRESHOLD and blurredLabel != predCat:
      #   filterGussedData.append('__weight__' + '0.8' + '\t' + '__label__' + str(predCat) + '\t' + content + '\n')
      else:
        continue

  print 'Total labeled data samples: ', len(initLabelSamples), ' , filteredGuessData: ' ,  len(filterGussedData)
  trainCorpus = initLabelSamples + filterGussedData
  writeFastTextCorpus(trainCorpus, catRecoTrainFile)
  return len(filterGussedData)



def predKlmlogPob(sentence, kenArpaModel):
  """Score `sentence` with one KenLM model.

  Args:
    sentence: text to score; it is lowercased before scoring.
    kenArpaModel: object exposing score(text, bos=..., eos=...).

  Returns:
    A pair (score, negAvgScore): the float value returned by
    kenArpaModel.score, and that value negated and divided by the number of
    whitespace-separated tokens. Both are MISSING_VALUE when the sentence is
    empty after stripping.
  """
  if len(sentence.strip()) == 0:
    return MISSING_VALUE, MISSING_VALUE

  sentenceScore = float(kenArpaModel.score(sentence.lower(), bos=True, eos=True))
  tokenCount = len(sentence.split())
  return sentenceScore, - 1.0 * sentenceScore / tokenCount



def loadKenLMs(workdir):
  kenArpaModels = collections.defaultdict(lambda : collections.defaultdict(lambda:''))
  for datatype in ['questions', 'answers', 'wholeQA']:
    for i in xrange(1,11,1):
      leafModel = workdir + '/' + datatype + '.' + str(i) + '.slm'
      if os.path.exists(leafModel) and (not os.path.isdir(leafModel) ):
        kenArpaModels[datatype][i] = kenlm.LanguageModel(leafModel)
  print "\n\n Loaded arpa SLM models ...\n"
  return kenArpaModels




def predictXgbScore(xgbModel, arpaModels, catrecoClassifier, question, answer, QA, candiCat):
  """Return the eval model's score for assigning category `candiCat` to a sample.

  Args:
    xgbModel: loaded xgb.Booster used for prediction.
    arpaModels: KenLM models as returned by loadKenLMs.
    catrecoClassifier: fastText classifier used for the catreco features.
    question, answer, QA: the question text, the answer text and both
      together.
    candiCat: candidate category whose feature values are looked up.

  Returns:
    The first (and only) element of xgbModel.predict for the single feature
    row built from the sample.
  """
  # getEvalXgbFeatures returns the per-category feature maps in exactly the
  # column order the eval model was trained with.
  featureMaps = getEvalXgbFeatures(question, answer, QA, arpaModels, catrecoClassifier)
  featureRow = [float(featureMap[candiCat]) for featureMap in featureMaps]
  # -9999 is the same value as the module's MISSING_VALUE sentinel
  testData = xgb.DMatrix(np.array([featureRow]), missing=-9999)
  return  xgbModel.predict(testData)[0]


def testXgbModel(testFile, xgbModelFile):
  threshold = EVAL_THRESHOLD
  #load XGBoost model
  bst = xgb.Booster({'nthread': 4})  # init model
  bst.load_model(xgbModelFile)  # load model
  #load testfile
  fdata = []  # feature data
  ldata = []  # response data
  for line in open (testFile, 'rb'):
    vList = line.strip().split('\t')
    try:
      fdata.append([float(ele) for ele in vList[1:]])
      ldata.append([float(vList[0])])
    except ValueError:
      continue
  fArry = np.array(fdata)
  lArry = np.array(ldata)
  testData = xgb.DMatrix(fArry, label=lArry, missing=-9999)
  #compute accuracy
  preds = bst.predict(testData)
  trueLabel = ldata
  predLabel = []
  for ele in preds:
    if ele > threshold:
      predLabel.append(1)
    else:
      predLabel.append(0)
  f1 = f1_score(trueLabel, predLabel)
  recall = recall_score(trueLabel, predLabel)
  prec = precision_score(trueLabel, predLabel)
  accuracy = accuracy_score(trueLabel, predLabel)
  print "\n\n\nXGBoost evaluation accuracy, prec,recall,f1: ", accuracy, prec, recall, f1, '\n\n'
  #save prediction for comparision purpose:
  with codecs.open(testFile + '.ComparedPred', 'w', 'utf-8') as outF:
    for i in xrange(len(trueLabel)):
      outF.write( str(trueLabel[i]) + '\t' + str(predLabel[i]) + '\t' + str(preds[i]) )
      for ele in fdata[i]:
        outF.write( '\t' + str(ele) )
      outF.write('\n')

  return accuracy, prec, recall, f1



def trainXgbModel(corpusFile, xgbModelFile ):
  print 'Start training xgboost model for evaluation module ...'
  fdata = []  # feature data
  ldata = []  # response data
  for line in open (corpusFile, 'rb'):
    vList = line.strip().split('\t')
    ldata.append([float(vList[0])])
    fdata.append([float(ele) for ele in vList[1:]])
  fArry = np.array(fdata)
  lArry = np.array(ldata)
  trainData = xgb.DMatrix(fArry, label=lArry, missing=-9999)
  param = {'max_depth': 6, 'eta': 0.3, 'silent':1, 'objective':'binary:logistic'}
  num_round = 200
  bst = xgb.train(param, trainData, num_round)
  print 'Done of xgboost model training to : ', xgbModelFile
  bst.save_model(xgbModelFile)
  bst.dump_model(xgbModelFile + '.dump.txt')
  print 'Plotting out feature importance ...'
  # print(bst.feature_importances_)
  #xgb.plot_importance(bst)




def _rankByScoreDesc(scores):
  """Map every key of `scores` to its 0-based position in descending (score, key) order."""
  ordered = sorted([(score, key) for (key, score) in scores.items() ], reverse=True)
  ranks = {}
  for position, (score, key) in enumerate(ordered):
    ranks[key] = position
  return ranks


def _slmFeatures(text, models):
  """Return (raw scores, normalized scores, ranks) of `text` for categories 1..10."""
  rawScores = {}
  for idx in xrange(1, 11):
    rawScores[idx] = predKlmlogPob( text.lower(), models[idx] )[0]
  bestScore = max(rawScores.values())
  # normalized score is 1.0 for the best category and shrinks with the gap
  normScores = {}
  for idx in xrange(1, 11):
    normScores[idx] = 1.0 / ( 1 + bestScore - rawScores[idx] )
  return rawScores, normScores, _rankByScoreDesc(rawScores)


def getEvalXgbFeatures(question, answer, QA, arpaModels, catrecoClassifier ):
  """Compute the per-category feature maps used by the eval model.

  Args:
    question, answer, QA: question text, answer text and both together.
    arpaModels: KenLM models indexed as arpaModels[datatype][category].
    catrecoClassifier: classifier exposing predict_proba(texts, k=...).

  Returns:
    Eleven dicts keyed by category, in this order: catreco scores and ranks;
    question SLM scores, normalized scores and ranks; the same three for the
    answer; the same three for the whole QA text. Ranks are 0-based with 0
    for the highest score. The catreco dicts only hold the categories that
    predict_proba returned for k=10.
  """
  # SLM features, computed in the order question -> answer -> whole QA
  questionSlmScores, questionNormSlmScores, questionRanks = _slmFeatures(question, arpaModels['questions'])
  answerSlmScores, answerNormSlmScores, answerRanks = _slmFeatures(answer, arpaModels['answers'])
  QASlmScores, QANormSlmScores, QARanks = _slmFeatures(QA, arpaModels['wholeQA'])

  # catreco features, taken from the classifier output on the whole QA text
  # TODO: should question and answer also be classified separately?
  catrecoScores = {}
  for (label, score) in catrecoClassifier.predict_proba([QA], k=10)[0]:
    catrecoScores[int(label)] = float(score)
  catrecoRanks = _rankByScoreDesc(catrecoScores)

  return  catrecoScores, catrecoRanks, questionSlmScores, questionNormSlmScores, questionRanks, \
          answerSlmScores, answerNormSlmScores, answerRanks, QASlmScores, QANormSlmScores, QARanks


def _format_eval_row(target, cat, features):
  """Return one corpus line: `target`, then every feature's value for `cat`, tab-separated."""
  return target + '\t' + '\t'.join([str(featureMap[cat]) for featureMap in features]) + '\n'


def _write_eval_samples(rawF, outF, arpaModels, catrecoClassifier, progressMsg, progressEvery):
  """Turn labeled 'label,question,answer' lines from `rawF` into eval-model samples in `outF`.

  For every usable line one positive row (the line's own category) and one
  negative row (a random other category out of 1..10) are written.
  """
  processedCnt = 0
  starttime = time.time()
  for line in rawF:
    info  = line.strip().split(',')
    lbl = info[0]
    if lbl.find('__label__') < 0:
      continue
    if len(','.join(info[1:]).strip()) < 1:
      continue
    labelCat = int(lbl.replace('__label__', ''))
    question = info[1].strip().strip(',').strip()
    answer = ','.join(info[2:]).strip().strip(',').strip()
    QA = ','.join(info[1:]).strip().strip(',').strip()
    #sanity check: nothing left once separators are stripped
    if  len(QA) < 1:
      print "Error in line:", line
      continue
    #print out progress
    processedCnt = processedCnt + 1
    if( processedCnt % progressEvery == 0 ):
      print progressMsg, processedCnt, " samples, current batch used ", time.time() - starttime, " seconds"
      starttime = time.time()
    features = getEvalXgbFeatures(  question, answer, QA,  arpaModels, catrecoClassifier )
    #positive sample: features of the labeled category
    outF.write( _format_eval_row('1', labelCat, features) )
    #negative sample: features of one random other category
    negCats = list( set(xrange(1,11,1)) - set([labelCat]) )
    random.shuffle(negCats)
    outF.write( _format_eval_row('0', negCats[0], features) )


def train_eval_model(workdir, evalModelCorpusFile, evalXgbModelFile, evalModelTestFile):
  """Generate the eval model's train/test corpora, then train and test the model.

  The training corpus is derived from 'yahoo_answers.labeled.train' and the
  test corpus from 'yahoo_answers.test' (both read from `workdir + '/..'`),
  using the current catreco model and the KenLM models in `workdir`.

  Args:
    workdir: directory holding 'catreco.model.bin' and the '*.slm' files.
    evalModelCorpusFile: path the training corpus is written to.
    evalXgbModelFile: path the trained XGBoost model is written to.
    evalModelTestFile: path the test corpus is written to.

  Returns:
    Whatever testXgbModel returns: the tuple (accuracy, prec, recall, f1).
  """
  catRecoModelFile = os.path.join(workdir, 'catreco.model')
  arpaModels = loadKenLMs( workdir )
  catrecoClassifier = fasttext.load_model(catRecoModelFile + '.bin', label_prefix='__label__')

  with codecs.open(evalModelTestFile, 'w', 'utf-8') as testF, \
       codecs.open(evalModelCorpusFile, 'w', 'utf-8') as corpusF, \
       codecs.open(workdir + '/../yahoo_answers.labeled.train', 'r', 'utf-8' ) as rawLabelF, \
       codecs.open(workdir + '/../yahoo_answers.test', 'r', 'utf-8') as rawTestF:

    # The two corpora are produced by two independent passes. Nesting the
    # test pass inside the training loop would reset the training progress
    # counter on every line and would make the test corpus depend on the
    # labeled file containing at least one usable line.
    _write_eval_samples(rawLabelF, corpusF, arpaModels, catrecoClassifier,
                        "Training Eval Model: processed ", 50000)
    _write_eval_samples(rawTestF, testF, arpaModels, catrecoClassifier,
                        "Testing Eval Model: processed ", 100000)

  # train and test xgboost models here
  trainXgbModel(evalModelCorpusFile, evalXgbModelFile)
  evalPrecision = testXgbModel(evalModelTestFile, evalXgbModelFile)

  return evalPrecision



def paraLearningBlurVer(workdir, toolDir, blurPercentage):

  #setup intermidate files needed during training process
  evalModelCorpusFile = os.path.join(workdir, 'eval.xgb.TrainCorpus')
  evalXgbModelFile = os.path.join(workdir, 'eval.xgb.model')
  evalModelTestFile = os.path.join(workdir, 'eval.xgb.TestFile')
  catRecoTrainFile = os.path.join(workdir, 'catreco.TrainCorpus' )

  curPrecsion = trainCatRecoModel( workdir + '/../yahoo_answers.labeled.train' , workdir, toolDir  )
  catrecoAccuracy, evalAccuracy, enrichedCount = [], [], []
  catrecoAccuracy.append(curPrecsion)
  print 'Initial Catreco model accuracy is: ', curPrecsion

  for itr in xrange(20):
    print '\n\nParallel  Learning iterating #', itr, ' ....'

    print '\nFirst step: prepare eval model training corpus and train eval models'
    accuracy, prec, recall, f1  = train_eval_model(workdir, evalModelCorpusFile, evalXgbModelFile, evalModelTestFile  )
    evalAccuracy.append( (accuracy, prec, recall, f1) )

    print '\nSecond step: select new catreco corpus, and train out new catreco model.'
    curEnrichedCount = build_catreco_corpus_with_paralearn(workdir, catRecoTrainFile, blurPercentage)
    curPrecsion = trainCatRecoModel(catRecoTrainFile, workdir , toolDir)
    catrecoAccuracy.append(curPrecsion)
    enrichedCount.append(curEnrichedCount)

  #summarize performance
  print 'Iteration Classification Model Performances are: ', str(catrecoAccuracy)
  print 'Iteration Evaluation Model Performances (accuracy, prec, recall, f1) are: ', str((accuracy, prec, recall, f1))
  print 'Iteration Enriched corpus acount:', curEnrichedCount

  with codecs.open(os.path.normpath(workdir + '/AccuracyReport.ParaLearning.txt'), 'w', 'utf-8') as outf:
    outf.write("\n\nParaLearning Learning Exp Logs is:\n CatReco Perf:\n" + str(catrecoAccuracy) + '\n' )
    outf.write("\nEval Model Perf (accuracy, prec, recall, f1) :\n" + str(evalAccuracy) + '\n' )
    outf.write("\nEnriched corpus account: \n " + str(enrichedCount) + '\n' )


def writeFastTextCorpus(data, filename):
  """Write the samples in `data` to `filename` in random order, UTF-8 encoded.

  Args:
    data: iterable of sample strings; each is written as-is, so it must
      already end with its line terminator. The caller's object is left
      untouched: a copy is shuffled.
    filename: path of the file to (over)write.
  """
  shuffled = list(data)
  random.shuffle(shuffled)
  with codecs.open(filename, 'w', 'utf-8') as corpusF:
    for sample in shuffled:
      corpusF.write(sample)



def trainCatRecoModel(corpusFile, workdir,  tooldir ):
  modelfile = os.path.join(workdir, 'catreco.model' )
  testfile = os.path.join(workdir, '../yahoo_answers.test')

  # set params
  dim = "10"
  lr =  "0.1"
  epoch = "5"
  minCount = "1"
  word_ngrams = "2"
  bucket = "10000000"
  loss = "softmax"

  # calculate Slp and Rank score
  cmdout = run_command( tooldir + '/fasttext ' + " supervised  -input  " + corpusFile + "  -output  " + modelfile \
                + " -dim  " + dim + " -lr  " + lr + " -epoch  " + epoch + " -minCount  " + minCount + " -bucket " + bucket \
                + " -thread 1 -minn 0  -maxn 0  " + " -loss " + loss + " -wordNgrams " + word_ngrams )

  # Train the classifier
  classifier = fasttext.load_model( modelfile + '.bin', label_prefix='__label__')

  # Test the classifier
  result = classifier.test(testfile)
  print "Current round model precison: ", result.precision, " with model: ", os.path.basename(modelfile)
  return result.precision



if __name__ == '__main__':
  if len(sys.argv) != 4:
    print " usage: python paraLearningBlurVer.py WorkDir ToolDir BlurPercentage"
    exit(1)
  paraLearningBlurVer( sys.argv[1] , sys.argv[2], sys.argv[3] )
