import pandas as pd
from tqdm import tqdm
import argparse
import json
from functools import partial

def read_jsonlist(path, filters=()):
    """Lazily yield the JSON values stored one-per-line in ``path``.

    Blank (whitespace-only) lines are skipped silently. A line is also
    skipped, after printing a notice, when it cannot be decoded as JSON or
    when a filter raises while inspecting it, so a single bad record does not
    abort the whole pass.

    Args:
        path: Path of a text file (read as UTF-8) with one JSON document per
            line.
        filters: Iterable of callables. Each one receives the decoded value
            and returns a truthy result to keep it; a value is yielded only
            when every filter accepts it.

    Yields:
        Each decoded value that passed all filters, in file order.
    """
    with open(path, encoding="utf8") as f:
        for ln in tqdm(f):
            ln = ln.strip()
            if not ln:
                continue
            try:
                di = json.loads(ln)
                keep = all(filt(di) for filt in filters)
            except Exception:
                # Best-effort parsing: report and move on. Catching Exception
                # (not a bare except) lets KeyboardInterrupt and GeneratorExit
                # propagate normally.
                print('Encountered an error. Skipping a line.')
                continue
            # The yield lives outside the try block so that closing the
            # generator early is never mistaken for a bad line.
            if keep:
                yield di


def score_filter(di, low=-1, high=5):
    """Accept records whose score lies outside the half-open band [low, high).

    Args:
        di: Mapping with a ``'score'`` entry.
        low: Scores strictly below this value are accepted.
        high: Scores greater than or equal to this value are accepted.

    Returns:
        Truthy when the score is at least ``high`` or below ``low``.
    """
    score = di['score']
    at_or_above_high = score >= high
    return at_or_above_high or score < low

def sticky_filter(di):
    """Accept records that are not stickied.

    Args:
        di: Mapping with a ``'stickied'`` entry.

    Returns:
        ``True`` when ``di['stickied']`` is falsy and ``False`` when it is
        truthy, for plain Python bool/int values. Any other value is inverted
        with ``~`` as before, which is element-wise for NumPy/pandas boolean
        data.

    Raises:
        KeyError: If ``di`` has no ``'stickied'`` entry.
    """
    stickied = di['stickied']
    if isinstance(stickied, int):
        # bool is a subclass of int, and bitwise inversion of a Python bool
        # yields -1 or -2 (both truthy), so `~` can never reject a record.
        # Logical negation gives the intended answer.
        return not stickied
    return ~stickied

def time_filter(post, comment, thresh=60*60*10):
    """Check that a comment was created no more than ``thresh`` after a post.

    Args:
        post: Mapping with a ``'created_utc'`` entry.
        comment: Mapping with a ``'created_utc'`` entry.
        thresh: Largest accepted difference, in the same unit as
            ``'created_utc'`` (presumably seconds, making the default ten
            hours -- confirm against the data).

    Returns:
        True when ``comment['created_utc'] - post['created_utc']`` does not
        exceed ``thresh``.
    """
    elapsed = comment['created_utc'] - post['created_utc']
    return elapsed <= thresh

def filter_subreddit(di, subreddits):
    """Check whether a record's subreddit, lowercased, is in ``subreddits``.

    Args:
        di: Mapping that may carry a ``'subreddit'`` entry; a missing entry
            is treated as the empty string.
        subreddits: Container of subreddit names. Only lowercase entries can
            match, because the record's name is lowercased before the lookup.

    Returns:
        True when the lowercased subreddit name is found in ``subreddits``.
    """
    name = di.get('subreddit', '')
    return name.lower() in subreddits

def filter_posts(di, posts):
    """Check whether a record's ``'link_id'`` is one of ``posts``.

    Args:
        di: Mapping with a ``'link_id'`` entry.
        posts: Container of accepted link ids.

    Returns:
        True when ``di['link_id']`` is found in ``posts``.
    """
    link_id = di['link_id']
    return link_id in posts

def filter_top_level_comments(di):
    """Check whether a record's ``'parent_id'`` begins with ``'t3_'``.

    NOTE(review): ``t3_`` is presumably the id prefix of a submission, so a
    match means the comment replies to the post itself -- confirm against
    the data source.

    Args:
        di: Mapping with a string ``'parent_id'`` entry.

    Returns:
        True when the parent id starts with ``'t3_'``.
    """
    parent_id = di['parent_id']
    return parent_id.startswith('t3_')


def main():
    """Filter a line-delimited JSON file down to a chosen set of subreddits.

    Reads ``--input-file`` line by line, keeps the records accepted by both
    ``sticky_filter`` and ``filter_subreddit`` (using the names listed in
    ``--subreddit-file``), loads the survivors into a DataFrame and writes it
    to ``--output-file`` with ``DataFrame.to_json``.
    """
    # Parse command-line arguments
    parser = argparse.ArgumentParser(description='Load a pushshift submissions file and filter by subreddits')
    parser.add_argument('--input-file', required=True, help='Path to the input file where every line is a JSON object')
    parser.add_argument('--subreddit-file', required=True, help='Path to the JSON file containing the list of subreddits to retain')
    parser.add_argument('--output-file', required=True, help='Path to the output JSON file')
    args = parser.parse_args()

    # Same explicit encoding as the input file, so the result does not
    # depend on the platform's default locale.
    with open(args.subreddit_file, 'r', encoding='utf8') as f:
        # filter_subreddit lowercases each record's subreddit before the
        # membership test, so the allow-list must be lowercase as well;
        # otherwise entries such as "AskReddit" could never match.
        subreddits = {name.lower() for name in json.load(f)}

    record_filters = [
        sticky_filter,
        partial(filter_subreddit, subreddits=subreddits),
    ]
    # The generator is materialised here because the DataFrame needs every
    # surviving record at once.
    df = pd.DataFrame(list(read_jsonlist(args.input_file, filters=record_filters)))

    df.to_json(args.output_file)

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()

# Example usage: python read_submissions.py --input-file /mnt/d/reddit/submissions/RS_2018-02 --subreddit-file emb_psr_subs.json --output-file /mnt/e/reddit/2018-02-out.json