# -*- coding:utf-8 -*-
import sys, os, time 
import argparse
import json

def write_subs(out_dir, out_dict):
    """
        Append the collected submissions to one file per subreddit.

        out_dir: directory that receives the files; it is created when it
                 does not exist yet and there is something to write.
        out_dict: { subreddit_id: submission list }, where every submission
                  is a sequence with at least 7 fields. The first 7 fields
                  are written space-separated, one submission per line.

        The file for a subreddit is named after its subreddit_id and is
        opened in append mode, so repeated calls accumulate records.
        Returns None.
    """
    # Only create the directory when there is data, so a call with an empty
    # dict leaves the filesystem untouched.
    if out_dict and not os.path.isdir(out_dir):
        os.makedirs(out_dir)

    for sub_id, submissions in out_dict.items():
        # 'a' already creates a missing file, so no fallback to 'w' is
        # needed; the with-block closes the handle even if a write fails.
        with open(os.path.join(out_dir, sub_id), 'a') as fout:
            for submission in submissions:
                fout.write('{} {} {} {} {} {} {}\n'.format(*submission[:7]))

def extract_submission(input_file, out_dir, flush_every=500000):
    """
        Extract submissions from a source file holding one JSON record per
        line and hand them, grouped by subreddit, to write_subs().

        For every record that carries all required fields the tuple
            (name, title, num_comments, score, ups, downs, subreddit_id)
        is buffered under its subreddit_id. The buffer is written out and
        cleared after every `flush_every` accepted records, and once more
        at the end of the file if anything is left.

        Lines that are blank, are not valid JSON, are not JSON objects, or
        lack one of the required fields are skipped and do not count
        towards the progress counter.

        input_file: path of the source file.
        out_dir: output directory, passed through to write_subs().
        flush_every: number of accepted records between two flushes.
    """
    # Order matters: it is the field order of the buffered tuples.
    required = (u'name', u'title', u'num_comments', u'score',
                u'ups', u'downs', u'subreddit_id')
    subreddit_dict = {}
    line_no = 0
    with open(input_file, 'r') as fin:
        for line in fin:
            try:
                submission_dict = json.loads(line)
            except ValueError:
                # Blank or malformed line: skip it instead of aborting the
                # whole extraction.
                continue
            try:
                record = tuple(submission_dict[key] for key in required)
            except (KeyError, TypeError):
                # KeyError: a required field is missing.
                # TypeError: the line is valid JSON but not an object.
                continue

            subreddit_dict.setdefault(record[-1], []).append(record)

            line_no += 1
            if line_no % flush_every == 0:
                print('Processing:{}'.format(line_no))
                # Flush periodically so the buffer does not grow with the
                # size of the input.
                write_subs(out_dir, subreddit_dict)
                subreddit_dict = {}

    # Write whatever is left after the last periodic flush.
    if subreddit_dict:
        write_subs(out_dir, subreddit_dict)


def build_parser():
    """
        Build the command line parser of the extraction script.

        Both --input_file and --out_dir are required string options.
    """
    parser = argparse.ArgumentParser(description='Extraction for Reddit Submissions')
    # Path Arguments
    parser.add_argument('--input_file', type=str, required=True,
                        help='location of the data corpus')
    parser.add_argument('--out_dir', type=str, required=True,
                        help='directory to write the per-subreddit output files to')
    return parser


def main(argv=None):
    """
        Parse the command line, print the parsed options and run the
        extraction.

        argv: list of argument strings; None makes argparse read
              sys.argv[1:].
    """
    args = build_parser().parse_args(argv)
    print(vars(args))

    extract_submission(args.input_file, args.out_dir)


# Guarded so that importing this module does not parse sys.argv or start an
# extraction as a side effect.
if __name__ == '__main__':
    main()