__author__ = 'luchen'
import os
import re
import json


# Patterns used to strip Twitter-specific markup from tweet text.
# All patterns are raw strings so that regex escapes such as ``\w`` are never
# interpreted (or warned about) as string escapes.
RE_P0 = re.compile(r'(RT @)(\w+)(:)', re.UNICODE)  # retweet tag
RE_P1 = re.compile(r'(@)(\w+)', re.UNICODE)  # user mention
RE_P2 = re.compile(r'(#)(\w+)', re.UNICODE)  # hashtag
# Generic URL of any scheme.  The tail is matched greedily: a lazy quantifier
# at the very end of a pattern always matches zero characters, which would
# remove only the ``scheme://`` prefix and leave the rest of the URL behind.
RE_P3 = re.compile(r'(\w+)://(\S*)', re.UNICODE)  # URL
RE_P4 = re.compile(r'&(#?)(x?)(\w+);', re.UNICODE)  # html code
# http(s) URL
RE_P5 = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', re.UNICODE)

# A run of word characters that does not start with a digit.
PAT_ALPHABETIC = re.compile(r'((?![\d])(\w)+)', re.UNICODE)


try:
    _TEXT_TYPE = unicode  # Python 2: separate unicode text type
except NameError:
    _TEXT_TYPE = str  # Python 3: ``str`` is already the unicode text type


def to_unicode(text, encoding='utf8', errors='strict'):
    """Return `text` as a unicode string.

    Parameters:
        text: a unicode string (returned unchanged) or an encoded byte string.
        encoding: codec used to decode `text` when it is not unicode yet.
        errors: decoding error policy handed to the codec
            (e.g. 'strict', 'ignore', 'replace').

    Returns:
        The unicode representation of `text`.

    Raises:
        UnicodeDecodeError: if decoding fails under the 'strict' policy.
        TypeError: if `text` is neither unicode nor a decodable byte string.
    """
    if isinstance(text, _TEXT_TYPE):
        return text
    return _TEXT_TYPE(text, encoding, errors=errors)


def remove_markup(text):
    """Strip tweet markup from `text` and return the cleaned string.

    The substitutions are applied in a fixed order: html codes, user mentions
    and http(s) URLs are each replaced by a space, hashtags keep their word
    but lose the leading '#', and any remaining URL match is replaced by a
    space.  Finally every square bracket is dropped.
    """
    substitutions = (
        (RE_P4, ' '),      # html code
        (RE_P1, ' '),      # user mention
        (RE_P5, ' '),      # http(s) url
        (RE_P2, ' \\2'),   # hashtag: keep the word, drop the hash symbol
        (RE_P3, ' '),      # any other url
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    # Brackets left over from markup carry no meaning; remove them.
    return text.replace('[', '').replace(']', '')


def select_en_tweet(line):
    """Extract the text of an English, non-retweet tweet from one JSON line.

    Parameters:
        line: one line of the raw collection, expected to hold a JSON object.

    Returns:
        A pair ``(content, cleaned)`` where `content` is the tweet text
        encoded as UTF-8 and `cleaned` is the same text after
        `remove_markup`.  ``(None, None)`` is returned when the line starts
        with ``{"delete":``, is not valid JSON, lacks the 'lang' or 'text'
        field, is not tagged 'en', starts with a retweet tag, or has empty
        text.
    """
    rejected = (None, None)

    if line.startswith('{"delete":'):
        return rejected
    try:
        tweet = json.loads(line)
    except ValueError:
        return rejected

    try:
        if tweet['lang'] != 'en':
            return rejected
        text = to_unicode(tweet['text'])
    except (LookupError, TypeError, UnicodeError):
        # Malformed record: missing field, non-object JSON value, or text
        # that cannot be decoded.  Anything else (e.g. KeyboardInterrupt or
        # a programming error) is deliberately allowed to propagate.
        return rejected

    # Skip empty tweets and tweets that begin with a retweet tag.
    if not text or RE_P0.match(text):
        return rejected
    return text.encode('utf8'), remove_markup(text)



def tokenize(text):
    """Yield the lowercased alphabetic tokens of `text`.

    Tokens are the matches of PAT_ALPHABETIC in the lowercased text, encoded
    as UTF-8 with every underscore removed.  Because ``\\w`` also matches
    '_', a word made only of underscores would become an empty string; such
    empty tokens are skipped so they neither count as tokens nor produce
    double spaces when the caller joins the result.
    """
    lowered = text.lower()
    for match in PAT_ALPHABETIC.finditer(lowered):
        token = match.group().encode('utf8').replace('_', '')
        if token:
            yield token


if __name__ == '__main__':
    # paras:  <tweet raw collection path> <full text out path> <cleaned text out path>
    #
    # Walks every sub-directory of the collection path, reads each file in it
    # line by line, and for every accepted tweet with more than one token
    # writes the original text to the first output file and the
    # space-joined tokens to the second.
    import sys

    if len(sys.argv) < 4:
        sys.exit('usage: %s <tweet raw collection path> <full text out path> '
                 '<cleaned text out path>' % sys.argv[0])

    corpus_path = str(sys.argv[1])  # '/home/luchen/Documents/Tweet_Corpus/Dec20Jan5'
    full_text_output = str(sys.argv[2])
    cleaned_text_out_path = str(sys.argv[3])

    # Context managers guarantee that both outputs and every input file are
    # closed, even if processing stops with an error.
    with open(full_text_output, 'w') as outp, \
            open(cleaned_text_out_path, 'w') as outp_cleaned:
        for date in os.listdir(corpus_path):
            date_dir = os.path.join(corpus_path, date)
            if not os.path.isdir(date_dir):
                continue
            for f in os.listdir(date_dir):
                with open(os.path.join(date_dir, f)) as fp:
                    for line in fp:
                        content, text = select_en_tweet(line)
                        if not text:
                            continue
                        tokens = list(tokenize(text))
                        # Keep only tweets with at least two tokens.
                        if len(tokens) > 1:
                            outp.write(content + '\n')
                            outp_cleaned.write(' '.join(tokens) + '\n')
                print('finished clean %s' % f)