__author__ = 'luchen'
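###
#  Multi-process text cleaner.
#  Prepares text for training word2vec models: removes useless \n, \t and
#  repeated whitespace, and also strips non-English content (every character
#  outside a-z, A-Z and 0-9 is replaced by a space).
###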

import re
import multiprocessing
from gensim import utils
import sys

# Command-line parameters: <input text path> <output text path>
# Both are read from sys.argv at module import time, so running the script
# with fewer than two arguments raises IndexError right here.
input_text_path = str(sys.argv[1])
output_text_path = str(sys.argv[2])


def read_lines(text):
    """Lazily yield every item of *text*, one at a time.

    :param text: any iterable; the main script passes an open file object,
        in which case each yielded item is one line including its trailing
        newline.
    :return: a generator producing the items of *text* in order.
    """
    # No index is needed, so iterate the source directly instead of going
    # through enumerate().
    for line in text:
        yield line


# One or more consecutive characters that are not ASCII letters or digits.
# This single class already covers newlines, tabs, spaces and punctuation, so
# a mixed run such as "\t ," collapses into exactly one replacement.
_NON_ALNUM_RUN = re.compile(r'[^a-zA-Z0-9]+')


def clean_text(line):
    """Normalize one line of text for word2vec training.

    Every run of characters outside ``a-z``, ``A-Z`` and ``0-9`` (whitespace,
    punctuation, non-ASCII characters, ...) is replaced by a single space, and
    leading/trailing whitespace is removed.

    The previous pattern ``\\n+|\\t+|[^a-zA-Z0-9]+|[ ]+]`` tried the newline
    and tab alternatives first, so a run that started with one of them was
    replaced piecewise and left several consecutive spaces in the result
    (e.g. ``"a\\t b"`` became ``"a  b"``). Its last alternative contained a
    stray ``]`` and could never match.

    :param line: the raw text line.
    :return: the cleaned line, or ``None`` when nothing is left after
        cleaning.
    """
    clean = _NON_ALNUM_RUN.sub(' ', line.strip()).strip()
    if clean:
        return clean
    return None

if __name__ == '__main__':
    # Clean the input file in parallel and write all surviving lines to the
    # output file, separated (and terminated) by a single space.

    # Use cpu_count() - 1 worker processes, but never fewer than one.
    processes = max(1, multiprocessing.cpu_count()-1)
    pool = multiprocessing.Pool(processes)
    try:
        # The context manager closes both files even if cleaning fails midway.
        with open(input_text_path) as file_content, \
                open(output_text_path, 'w') as fout:
            count = 0
            # Hand the workers 1000 lines per process at a time so the whole
            # input never has to be held in memory at once.
            chunks = utils.chunkize(read_lines(file_content),
                                    chunksize=1000*processes)
            for chunk in chunks:
                for cleaned in pool.map(clean_text, chunk):
                    count += 1
                    # Progress report every 100000 input lines; the
                    # parenthesized form prints the same on Python 2 and 3.
                    if count % 100000 == 0:
                        print(count)
                    # clean_text returns None for lines that end up empty.
                    if cleaned:
                        fout.write(cleaned+' ')
    finally:
        # Always stop the worker processes, including on errors.
        pool.terminate()