"""
Clean the training data: collapse duplicate samples and remove negative
samples whose features conflict with (are identical to) a positive sample.
"""
import sys

def disambiguate(inFile, outFile):
    """Deduplicate labelled samples and drop negatives that clash with positives.

    Each non-blank line of ``inFile`` is read as ``<label> <feature>``, split
    at the first space after trailing whitespace has been stripped. A label of
    exactly ``-1`` marks a negative sample; every other label is treated as
    positive. ``outFile`` receives each distinct positive feature string as
    ``+1 <feature>``, followed by each distinct negative feature string that
    never occurred as a positive, written as ``-1 <feature>``.

    Four numbers are printed to stdout, one per line: the number of distinct
    positive and negative feature strings read, then the number of positive
    and negative lines written.

    Args:
        inFile: Path of the raw sample file to read.
        outFile: Path of the cleaned file to write (created or overwritten).

    Returns:
        None.
    """
    # Dicts are used as key-only collections; on Python 3.7+ they keep
    # first-seen order, which makes the output order reproducible.
    posDict = {}  # feature strings seen with a positive label
    negDict = {}  # feature strings seen with the "-1" label

    with open(inFile) as fdata:
        for rawLine in fdata:
            line = rawLine.rstrip()
            if not line:
                # A blank line is not end-of-file: skip it and keep reading.
                continue
            # partition() keeps the label intact when the line has no space
            # (label-only line), leaving the feature string empty.
            label, _, feature = line.partition(' ')
            if label == "-1":
                negDict[feature] = True
            else:
                posDict[feature] = True

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(len(posDict))
    print(len(negDict))

    posCounter = 0
    negCounter = 0
    # The output is opened only after the input has been fully read and
    # closed, so the input is never truncated while it is being read.
    with open(outFile, 'w') as fout:
        for feature in posDict:
            fout.write("+1 {0}\n".format(feature))
            posCounter += 1
        for feature in negDict:
            # A negative whose features also appear as a positive is a
            # conflicting sample and is dropped.
            if feature not in posDict:
                fout.write("-1 {0}\n".format(feature))
                negCounter += 1

    print(posCounter)
    print(negCounter)

if __name__ == "__main__":
    # Expect exactly two arguments: the raw input file and the cleaned
    # training file to produce.
    if len(sys.argv) != 3:
        # Single-argument print(...) works on both Python 2 and Python 3.
        print("{0} rawFile trainFile".format(sys.argv[0]))
        # Signal the usage error to the calling shell instead of exiting 0.
        sys.exit(1)
    disambiguate(sys.argv[1], sys.argv[2])

