#########################################################################
#
#  This function introduces a controlled amount of simulated label errors into the "unlabeled.train" dataset in order
#  to simulate noisy user feedback-loop signals, as opposed to the ground truth.
#
#  Input:  100K train samples whose labels are always known + 1.3 million train samples whose labels are pretended
#          to be unknown.
#  Output: blurred label set = 100K ground-truth label set + the "unknown" label set with x% of its labels randomly
#          flipped. The percentages of randomly flipped (wrong) labels are: 5%, 10%, 20%, 30%, 40%, 50%, 60%, 70%
##########################################################################


import codecs
import io
import random
import sys



def _iterSampleLines(path):
  """Yield the lines of the UTF-8 text file at `path`, each ending in a newline.

  Only '\\n' is treated as a line terminator, and a final line that lacks one
  gets a '\\n' appended so that it cannot fuse with the line written after it.
  """
  # newline='\n' matters: iterating a codecs.open() stream would additionally
  # break lines at characters such as U+2028 or U+0085, which may occur inside
  # the sample text and would cut a sample in two.
  with io.open(path, 'r', encoding='utf-8', newline='\n') as inF:
    for line in inF:
      yield line if line.endswith('\n') else line + '\n'


def _parseLabeledLine(line):
  """Split a '<__label__N>,<content>' line at its first comma.

  Returns:
    A (head, rest, label) tuple where `head` is the raw text before the first
    comma, `rest` is the raw text after it (including the line ending) and
    `label` is the integer N; or None when the line has no comma, no
    '__label__' marker, an empty content part or a non-integer label.
  """
  head, sep, rest = line.partition(',')
  if not sep or '__label__' not in head or not rest.strip():
    return None
  try:
    label = int(head.replace('__label__', ''))
  except ValueError:
    return None
  return head, rest, label


def BlurringLabels(workdir, errorRates=(5, 10, 20, 30, 40, 50, 60, 70), numClasses=10, seed=None):
  """Write one noisy ("blurred") training file per error rate into `workdir`.

  Reads `workdir`/yahoo_answers.labeled.train and
  `workdir`/yahoo_answers.unlabeled.train. For every rate in `errorRates` it
  writes `workdir`/yahoo_answers.blurred<rate>.train containing, in this order:
  all labeled lines unchanged, the untouched part of the shuffled unlabeled
  lines, and finally the remaining unlabeled lines with their label replaced by
  a different, randomly chosen one. A flipped line differs from its input line
  only in the label number.

  Unlabeled lines that cannot be parsed (no comma, no '__label__' marker, empty
  content, non-integer label) are skipped and counted in the progress output.

  Args:
    workdir: Directory that holds the two input files and receives the outputs.
    errorRates: Percentages (0..100) of unlabeled lines whose label is flipped;
      one output file is produced per entry.
    numClasses: Number of classes; wrong labels are drawn from 1..numClasses.
    seed: Optional seed for a private random generator. When None, the
      module-level `random` state is used.

  Raises:
    ValueError: If a rate is outside [0, 100] or `numClasses` is below 2.
  """
  errorRates = list(errorRates)
  for eRate in errorRates:
    if not 0 <= eRate <= 100:
      raise ValueError('error rate must be within [0, 100], got %r' % (eRate,))
  if numClasses < 2:
    raise ValueError('numClasses must be at least 2, got %r' % (numClasses,))

  # The `random` module and a Random instance both offer shuffle() and choice().
  rng = random if seed is None else random.Random(seed)

  # Single-argument print(...) calls behave the same on Python 2 and Python 3.
  print('Loading labelled data...')
  initLabelSamples = list(_iterSampleLines(workdir + '/yahoo_answers.labeled.train'))

  print('Loading unlabeled data ...')
  unLabeledData = []
  skippedCount = 0
  for line in _iterSampleLines(workdir + '/yahoo_answers.unlabeled.train'):
    # Validate up front so that a bad label cannot crash the run later, and
    # only when the line happens to land in the flipped part.
    if _parseLabeledLine(line) is None:
      skippedCount += 1
      continue
    unLabeledData.append(line)

  print('Total labeled data samples: %d , unlabeledData: %d (skipped %d malformed lines)'
        % (len(initLabelSamples), len(unLabeledData), skippedCount))
  print('Now start blurring %s labels to simulate noisy user feedback loop signals....'
        % ', '.join('%s%%' % eRate for eRate in errorRates))

  allLabels = range(1, numClasses + 1)
  flipChoices = {}  # true label -> ascending list of every other label
  for eRate in errorRates:
    with codecs.open(workdir + '/yahoo_answers.blurred' + str(eRate) + '.train', 'w', 'utf-8') as outF:
      for line in initLabelSamples:
        outF.write(line)
      # Shuffled in place, so every rate continues from the previous order.
      rng.shuffle(unLabeledData)
      noflipCount = int(len(unLabeledData) * (100.0 - eRate) / 100.0)
      for idx, line in enumerate(unLabeledData):
        if idx < noflipCount:
          outF.write(line)
          continue
        head, rest, lbl = _parseLabeledLine(line)
        if lbl not in flipChoices:
          flipChoices[lbl] = [cand for cand in allLabels if cand != lbl]
        flipLabel = rng.choice(flipChoices[lbl])
        # Swap only the label token; whitespace around it and the content are
        # kept verbatim so flipped and untouched lines share one format.
        outF.write(head.replace(head.strip(), '__label__' + str(flipLabel), 1) + ',' + rest)


if __name__ == '__main__':
  # Command-line entry point: expects exactly one argument, the working
  # directory, which is passed to BlurringLabels.
  if len(sys.argv) != 2:
    # Single-argument print(...) works on both Python 2 and Python 3.
    print(" usage: python blurringLabels.py WorkDir ")
    # sys.exit rather than the bare exit(): the latter is a helper injected by
    # the `site` module and is not guaranteed to exist (e.g. under `python -S`).
    sys.exit(1)
  BlurringLabels(sys.argv[1])


