import os
import json
from datetime import datetime
from PushshiftDumps.scripts.filter_file import read_lines_zst
from path_manager import PathManager
import pandas as pd


def get_amount_of_lines(file_path):
    """Count the items that ``read_lines_zst`` yields for *file_path*.

    The items are counted as they stream past instead of being collected
    into a list first, so nothing read from the file is kept around just
    to be measured.

    Args:
        file_path: Path handed to ``read_lines_zst`` (in this script, one
            of the ``.zst`` dump files).

    Returns:
        The number of items produced by ``read_lines_zst(file_path)``.
    """
    return sum(1 for _ in read_lines_zst(file_path))

def preprocessing(current_file_name, current_path):
    """Convert one dump file into a CSV holding its parsed JSON records.

    Every line yielded by ``read_lines_zst`` is parsed as JSON. Records
    whose ``created_utc`` field can be read as an integer timestamp are
    collected and written to a CSV in the directory returned by
    ``PathManager.get_preprocessed_data_path()``; the CSV name is
    *current_file_name* with ``.zst`` replaced by ``.csv``. Lines that fail
    to parse, lack ``created_utc`` or carry an unusable value for it are
    reported and counted as bad lines instead of aborting the run.

    A progress line is printed every 100,000 lines.

    Args:
        current_file_name: Name of the dump file; used to derive the CSV name.
        current_path: Path of the dump file to read.

    Returns:
        Always ``0``.
    """
    # Imported locally so the block does not depend on a file-level change.
    from datetime import timezone

    file_size = os.stat(current_path).st_size
    file_lines = 0
    created = None  # date of the most recent good record, for progress output
    bad_lines = 0
    output_path = os.path.join(
        PathManager.get_preprocessed_data_path(),
        current_file_name.replace(".zst", ".csv"),
    )
    data = []
    for line, file_bytes_processed in read_lines_zst(current_path):
        file_lines += 1
        if file_lines % 100000 == 0:
            print(
                f"{created} Line: {file_lines:,} Bad Lines: {bad_lines:,} Bytes Processed: {file_bytes_processed:,} : {(file_bytes_processed / file_size) * 100:.0f}%")
        try:
            obj = json.loads(line)
            # Timezone-aware replacement for the deprecated utcfromtimestamp();
            # the formatted UTC date is the same.
            created = datetime.fromtimestamp(
                int(obj["created_utc"]), tz=timezone.utc
            ).strftime("%Y/%m/%d")
        except (KeyError, TypeError, ValueError) as err:
            # json.JSONDecodeError is a ValueError subclass. TypeError covers a
            # null timestamp or a JSON value that is not an object.
            # The exception must be formatted, not concatenated: "str" + err
            # raises TypeError and would abort on the first bad line.
            print(f"Error: {err}")
            bad_lines += 1
        else:
            data.append(obj)
    df = pd.DataFrame.from_records(data)
    df.to_csv(output_path, index=False)
    return 0


if __name__ == "__main__":
    # Subreddits whose dump files are inspected.
    subreddits = [
        "personalfinance",
        "financialindependence",
        "FinancialPlanning",
        "investing",
        "wallstreetbets",
        "Wallstreetbetsnew",
        "stocks",
        "StockMarket",
        "pennystocks",
        "options",
        "RealEstate",
        "Economics",
        "realestateinvesting",
        "AskEconomics",
        "explainlikeimfive",
    ]
    # One dump file per subreddit for each of these content types.
    submission_strings = ["submissions", "comments"]

    # Every (subreddit, content type) combination, subreddit-major order.
    combinations = [
        (name, kind) for name in subreddits for kind in submission_strings
    ]
    for subreddit, submission_string in combinations:
        file_name = f"{subreddit}_{submission_string}.zst"
        path = os.path.join(PathManager.get_data_path(), file_name)
        # The CSV conversion is currently switched off; only line counts
        # are reported.
        # preprocessing(file_name, path)
        current_length = get_amount_of_lines(path)
        print(f"Subreddit {subreddit} has {current_length} {submission_string}")