# -*- coding:utf-8 -*-
"""
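    Extract the direct (top-level) comments from a hierarchical comment thread.

    In the thread sketched below, c1, c2 and c3 are extracted, while the
    nested reply c11 is not:

    Title
    c1
        c11  (not extracted)
    c2
    c3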
"""
import sys, os, time 
import argparse
import json

def write_comments(out_dir, out_dict):
    """
        Append the buffered submissions to one tab-separated file per key.

        out_dir:  directory that holds the output files; it is created when
                  it does not exist yet.
        out_dict: { subreddit_id: submission list }. Each submission is a
                  sequence whose first eight fields become one output line;
                  the file is named after the subreddit_id.

        Returns the number of submissions that could not be written, either
        because they have fewer than eight fields or because their text
        cannot be encoded as UTF-8.
    """
    # One '{}' per column: a row with fewer than eight fields raises
    # IndexError while formatting, before anything reaches the file.
    row_format = '\t'.join(['{}'] * 8) + '\n'
    code_error = 0
    if out_dir:
        # Fail early (or create the directory) instead of on the first open().
        os.makedirs(out_dir, exist_ok=True)
    for sub_id, submissions in out_dict.items():
        # Append mode already creates a missing file, so no 'w' fallback is
        # needed. The explicit encoding keeps the output independent of the
        # platform's default encoding.
        with open(os.path.join(out_dir, sub_id), 'a', encoding='utf-8') as fout:
            for submission in submissions:
                try:
                    fout.write(row_format.format(*submission[:8]))
                except (LookupError, TypeError, UnicodeError):
                    # Malformed or unencodable row: count it and keep going.
                    code_error += 1
    return code_error

def extract_directcommets(input_file, out_dir, min_words=5, max_words=200,
                          flush_every=1000000):
    """
        Collect the direct comments of a comment dump, grouped by subreddit,
        and hand them to write_comments.

        input_file:  UTF-8 text file with one JSON-encoded comment per line.
        out_dir:     output directory, passed through to write_comments.
        min_words,
        max_words:   inclusive bounds on the number of whitespace-separated
                     words in a comment body; other comments are skipped.
        flush_every: number of input lines after which the buffered comments
                     are written out and the buffer is emptied.

        A comment is kept when its parent_id equals its link_id. Lines that
        are not valid JSON, or that lack one of the required fields, are
        skipped. A progress line is printed at every flush, and the total
        number of rows write_comments failed to write is printed at the end.
    """
    code_error_t = 0
    subreddit_dict = {}
    with open(input_file, 'r', encoding='utf-8') as fin:
        for line_no, line in enumerate(fin, 1):
            if line_no % flush_every == 0:
                # Flush periodically so the buffer does not grow with the
                # size of the whole dump.
                print('Processing:{}'.format(line_no))
                code_error_t += write_comments(out_dir, subreddit_dict)
                subreddit_dict = {}
            try:
                comment_dict = json.loads(line)
                subreddit_id = comment_dict['subreddit_id']
                name = comment_dict['name']
                parent_id = comment_dict['parent_id']
                link_id = comment_dict['link_id']
                # Tabs and newlines are the field/row separators of the output.
                body = comment_dict['body'].replace('\n', ' ').replace('\t', ' ')
                score = comment_dict['score']
                ups = comment_dict['ups']
                downs = comment_dict['downs']
            except (ValueError, LookupError, TypeError, AttributeError):
                # ValueError: the line is not valid JSON (e.g. a blank line).
                # The others: a field is missing, the record is not a JSON
                # object, or the body is not a string.
                continue
            word_num = len(body.split())
            if word_num < min_words or word_num > max_words:
                continue
            if parent_id != link_id:
                # A reply to another comment, not to the submission itself.
                continue
            subreddit_dict.setdefault(subreddit_id, []).append(
                [name, parent_id, link_id, body, score, ups, downs, subreddit_id])
        # Write whatever is left since the last periodic flush.
        if subreddit_dict:
            code_error_t += write_comments(out_dir, subreddit_dict)
    print(code_error_t)

def main():
    """
        Command-line entry point: parse --input_file and --out_dir, echo the
        parsed arguments, run the extraction and print the elapsed seconds.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_file', type=str, required=True,
                        help='input path')
    parser.add_argument('--out_dir', type=str, required=True,
                        help='output directory')
    args = parser.parse_args()
    print(vars(args))
    # perf_counter is monotonic, so the interval is not affected by system
    # clock adjustments during a long run.
    start_time = time.perf_counter()
    extract_directcommets(args.input_file, args.out_dir)
    print('Time:', time.perf_counter() - start_time)


# Guarded so that importing this module does not parse sys.argv or start a run.
if __name__ == '__main__':
    main()
