from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import nltk
import spacy
import pandas as pd
import json
import os
from datetime import datetime
import zstandard
from tqdm import tqdm

def collect_subreddit_texts(records, subreddits):
    """Concatenate the texts of each subreddit and return them in a fixed order.

    Args:
        records: Iterable of mappings, each with a "subreddit" key and a
            "text" key holding a string.
        subreddits: Subreddit names, in the order the result should follow.

    Returns:
        A list with one string per entry of ``subreddits``: all texts of that
        subreddit joined with single spaces, in the order they were seen.

    Raises:
        KeyError: If any requested subreddit has no record at all.
    """
    # Gather the pieces in lists and join once per subreddit; growing a string
    # with "+" for every record re-copies the accumulated text each time.
    pieces_by_subreddit = {}
    for record in records:
        pieces_by_subreddit.setdefault(record["subreddit"], []).append(record["text"])

    missing = [name for name in subreddits if name not in pieces_by_subreddit]
    if missing:
        raise KeyError(f"no records found for subreddits: {missing}")

    return [" ".join(pieces_by_subreddit[name]) for name in subreddits]


def select_noun_columns(tagged_terms, vocabulary):
    """Pick the vocabulary terms that were tagged as nouns longer than 3 chars.

    Args:
        tagged_terms: Iterable of ``(text, pos)`` pairs, where ``pos`` is the
            coarse part-of-speech label of the token.
        vocabulary: Iterable of the column names that may be selected.

    Returns:
        A list of unique terms, in first-seen order, that are tagged "NOUN",
        have more than three characters and occur in ``vocabulary``.
    """
    known_terms = set(vocabulary)  # constant-time membership checks
    # A dict keeps first-seen order while dropping repeats, so the result can
    # be used to index DataFrame columns without producing duplicates.
    selected = {}
    for text, pos in tagged_terms:
        if pos == "NOUN" and len(text) > 3 and text in known_terms:
            selected.setdefault(text, None)
    return list(selected)


if __name__ == "__main__":
    subreddits = ["personalfinance", "financialindependence", "FinancialPlanning", "investing", "wallstreetbets",
                  "Wallstreetbetsnew", "stocks", "StockMarket", "pennystocks", "options", "RealEstate", "Economics",
                  "realestateinvesting", "AskEconomics", "explainlikeimfive"]
    nlp = spacy.load('en_core_web_sm')
    # Read the file as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open("./data/final_data_tokenized_filter.json", "r", encoding="utf-8") as inputfile:
        dataset = json.load(inputfile)

    # One document per subreddit: every text of that subreddit, space-joined.
    all_titles = collect_subreddit_texts(tqdm(dataset), subreddits)
    df = pd.DataFrame({"subreddits": subreddits, "texts": all_titles})
    df.set_index("subreddits", inplace=True)

    # Unigram TF-IDF over the per-subreddit documents.
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
    tfidf_separate = tfidf_vectorizer.fit_transform(df["texts"])
    df_tfidf = pd.DataFrame(
        tfidf_separate.toarray(), columns=tfidf_vectorizer.get_feature_names_out(), index=df.index
    )

    # Tag the whole vocabulary as one space-joined text and keep only the
    # columns whose term comes back as a noun.
    # NOTE(review): spaCy rejects texts longer than nlp.max_length; confirm the
    # joined vocabulary stays below that limit for the full dataset.
    columns = df_tfidf.columns.tolist()
    columns_spacy = nlp(" ".join(columns))
    # spaCy may split a vocabulary term into several tokens, so every token is
    # checked against the real column names before it is used as a column.
    overall_columns = select_noun_columns(
        ((token.text, token.pos_) for token in columns_spacy), columns
    )
    new_df_tfidf = df_tfidf[overall_columns].copy()
    new_df_tfidf.to_csv("tfidf_scores_per_subreddit_overall_filtered.csv")