from PushshiftDumps.scripts.filter_file import read_lines_zst
import os
from path_manager import PathManager
import json
import random


def merge_datasets(file_path, data, subreddit=None):
    """Parse every line of ``file_path`` as JSON and append the objects to ``data``.

    The file is read through ``read_lines_zst``, which is consumed here as an
    iterable of ``(line, bytes_processed)`` pairs. Progress is printed every
    100,000 lines. Lines that are not valid JSON are counted, reported and
    skipped instead of aborting the merge.

    Args:
        file_path: Path of the input file; its on-disk size is used for the
            progress percentage.
        data: List that the parsed objects are appended to (mutated in place).
        subreddit: Optional label used in the final summary line. When
            omitted, the label is taken from the file name, using the part
            before ``"_qa_"`` (files are expected to be named
            ``<subreddit>_qa_...``).

    Returns:
        The same ``data`` list that was passed in.
    """
    file_size = os.stat(file_path).st_size
    file_lines = 0
    bad_lines = 0

    if subreddit is None:
        # Do not depend on a caller-side global; recover the label from the file name.
        subreddit = os.path.basename(file_path).split("_qa_")[0]

    for line, file_bytes_processed in read_lines_zst(file_path):
        file_lines += 1
        if file_lines % 100000 == 0:
            print(
                f"Line: {file_lines:,} Bad Lines: {bad_lines:,} Bytes Processed: {file_bytes_processed:,} : {(file_bytes_processed / file_size) * 100:.0f}%")
        try:
            obj = json.loads(line)
        except json.JSONDecodeError as err:
            # Count the malformed line so the progress output reflects it, then move on.
            bad_lines += 1
            print(f"Error: {err}")
        else:
            data.append(obj)
    print(f"Subreddit {subreddit} has {file_lines} entries")
    return data


if __name__ == "__main__":
    # Script entry point: for each QA-set directory below the root path, merge the
    # per-subreddit files into one list and write it to "complete_qa.json" in that directory.
    subreddits = ["personalfinance", "financialindependence", "FinancialPlanning", "investing", "wallstreetbets",
                  "Wallstreetbetsnew", "stocks", "StockMarket", "pennystocks", "options", "RealEstate", "Economics",
                  "realestateinvesting", "AskEconomics", "explainlikeimfive"]
    path = os.path.join(PathManager.get_question_answers_path(), "updated_filters_no_filter_comments_good_bad")
    # TODO: Adapt to handle all available QA sets
    for directory in os.listdir(path):
        directory_path = os.path.join(path, directory)
        # Only sub-directories hold QA sets; skip stray files (e.g. ".DS_Store")
        # instead of failing later when looking for files inside them.
        if not os.path.isdir(directory_path):
            continue
        result_data = []
        output_path = os.path.join(directory_path, "complete_qa.json")
        # Keep this loop variable named `subreddit` at module level:
        # merge_datasets may read that name when printing its summary line.
        for subreddit in subreddits:
            current_path = os.path.join(directory_path, f"{subreddit}_qa_max_level_0.zst")
            result_data = merge_datasets(current_path, result_data)
            # NOTE(review): the list is reshuffled (fixed seed 42) after every subreddit
            # rather than once after the loop. Left unchanged because moving it would
            # alter the order of the generated dataset -- confirm whether that matters.
            random.Random(42).shuffle(result_data)
        print(f"QA Dataset for {directory} has {len(result_data)} entries.")
        with open(output_path, "w") as f:
            json.dump(result_data, f)
