from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import nltk
import spacy
import pandas as pd
import json
import os
from path_manager import PathManager
from PushshiftDumps.scripts.filter_file import read_lines_zst
from datetime import datetime
from analysis import get_basic_attribute
# def get_tfidf_top_features(documents,n_top=10):
#   tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,  stop_words='english')
#   tfidf = tfidf_vectorizer.fit_transform(documents)
#   importance = np.argsort(np.asarray(tfidf.sum(axis=0)).ravel())[::-1]
#   tfidf_feature_names = np.array(tfidf_vectorizer.get_feature_names_out())
#   return tfidf_feature_names[importance[:n_top]]

# corpus = [
#     'I would like to check this document',
#     'How about one more document',
#     'Aim is to capture the key words from the corpus',
#     'frequency of words in a document is called term frequency'
# ]

# x = get_tfidf_top_features(corpus, 3)
# nlp = spacy.load("en_core_web_sm")

# doc = nlp(corpus[0])

# nouns = [token.lemma_ for token in doc if token.pos_ == "NOUN"]
# verbs = [token.lemma_ for token in doc if token.pos_ == "VERB"]

# y = 4


# mylist = [
#     'This is the first document.',
#     'This document is the second document.',
#     'And this is the third one.',
#     'Is this the first document?',
#     'Is this the second cow?, why is it blue?',
# ]
# df = pd.DataFrame({"texts": mylist})
# tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
# tfidf_separate = tfidf_vectorizer.fit_transform(df["texts"])

# df_tfidf = pd.DataFrame(
#     tfidf_separate.toarray(), columns=tfidf_vectorizer.get_feature_names_out(), index=df.index
# )
# x =1

def preprocessing(current_path, max_lines=2000):
    """Build one lemmatized text blob from the titles in a ``.zst`` dump.

    Each line yielded by ``read_lines_zst`` is parsed as JSON, its ``title``
    attribute is lower-cased and lemmatized token by token with spaCy, and
    the lemmatized titles are joined with single spaces.

    NOTE: this function uses the module-level spaCy pipeline ``nlp``. In this
    file ``nlp`` is only created inside the ``__main__`` block, so code that
    imports this module must define it before calling this function.

    Args:
        current_path: Path of the compressed dump file to read.
        max_lines: Maximum number of lines to read from the file. Defaults to
            2000 (the previously hard-coded cap). ``None`` never matches the
            line counter, so the whole file is read.

    Returns:
        A single string with all lemmatized titles separated by spaces, or an
        empty string if no usable title was found. Titles that lemmatize to
        an empty string are skipped so they add no stray separators.
    """
    file_size = os.stat(current_path).st_size
    file_lines = 0
    created = None  # date of the most recent successfully parsed line
    bad_lines = 0
    lemmatized_titles = []
    for line, file_bytes_processed in read_lines_zst(current_path):
        if file_lines == max_lines:
            break
        file_lines += 1
        if file_lines % 1000 == 0:
            print(
                f"{created} Line: {file_lines:,} Bad Lines: {bad_lines:,} Bytes Processed: {file_bytes_processed:,} : {(file_bytes_processed / file_size) * 100:.0f}%")
        try:
            obj = json.loads(line)
            created = datetime.utcfromtimestamp(int(obj["created_utc"])).strftime("%Y/%m/%d")
            new_title = get_basic_attribute(obj, "title").lower()
            spacy_title = nlp(new_title)
            lemmatized_text = ' '.join(token.lemma_ for token in spacy_title)
            if lemmatized_text:
                lemmatized_titles.append(lemmatized_text)
        except (KeyError, json.JSONDecodeError) as err:
            # Format the exception instead of concatenating it: "str + Exception"
            # raises TypeError and would abort the run on the first bad line.
            print(f"Error: {err!r}")
            bad_lines += 1
    print("Got subreddit title combination!")
    # Join once at the end instead of growing a string with += in the loop.
    return " ".join(lemmatized_titles)


if __name__ == "__main__":
    # Script entry point: compute per-subreddit TF-IDF scores over lemmatized
    # submission titles, keep only noun features longer than three characters,
    # and write the resulting table to a CSV file.

    # Each subreddit is expected as "<name>_submissions.zst" in the data path.
    subreddits = ["personalfinance", "financialindependence", "FinancialPlanning", "investing", "wallstreetbets",
                  "Wallstreetbetsnew", "stocks", "StockMarket", "pennystocks", "options", "RealEstate", "Economics",
                  "realestateinvesting", "AskEconomics", "explainlikeimfive"]
    # Smaller selection for quick runs:
    # subreddits = ["AskEconomics", "realestateinvesting"]

    # Must stay a module-level name: preprocessing() reads the global ``nlp``.
    nlp = spacy.load('en_core_web_sm')

    # One combined, lemmatized document per subreddit (same order as the list).
    all_titles = []
    for subreddit in subreddits:
        print(f"Starting preprocessing of subreddit {subreddit}")
        file_name = f"{subreddit}_submissions.zst"
        path = os.path.join(PathManager.get_data_path(), file_name)
        all_titles.append(preprocessing(path))

    df = pd.DataFrame({"subreddits": subreddits, "texts": all_titles})
    df.set_index("subreddits", inplace=True)

    # Unigram TF-IDF where every subreddit is a single document.
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
    tfidf_separate = tfidf_vectorizer.fit_transform(df["texts"])

    df_tfidf = pd.DataFrame(
        tfidf_separate.toarray(), columns=tfidf_vectorizer.get_feature_names_out(), index=df.index
    )

    # POS-tag the vocabulary by running spaCy over all feature names at once.
    columns = df_tfidf.columns.tolist()
    known_columns = set(columns)  # O(1) membership instead of scanning the list
    columns_spacy = nlp(" ".join(columns))

    # Keep nouns longer than three characters (an earlier variant also kept
    # VERB tokens). spaCy may split a feature name into several tokens, so
    # only tokens that are real feature names are kept. dict.fromkeys removes
    # repeats while preserving order, so no column is selected twice.
    overall_columns = list(dict.fromkeys(
        str(token)
        for token in columns_spacy
        if token.pos_ == "NOUN" and len(token) > 3 and str(token) in known_columns
    ))

    new_df_tfidf = df_tfidf[overall_columns].copy()
    new_df_tfidf.to_csv("tfidf_scores_per_subreddit_new.csv")