import copy

import zstandard
import os
import json
import math
import re
from datetime import datetime
from path_manager import PathManager
from PushshiftDumps.scripts.filter_file import write_line_zst, read_lines_zst
from analysis import get_basic_attribute, get_richtext
import numpy as np

# Regex fragments used by split_into_sentences. They are raw strings so that
# backslash escapes such as \s reach the regex engine unchanged instead of
# depending on Python tolerating unknown string escapes.
alphabets = r"([A-Za-z])"
prefixes = r"(Mr|St|Mrs|Ms|Dr)[.]"
suffixes = r"(Inc|Ltd|Jr|Sr|Co)"
starters = r"(Mr|Mrs|Ms|Dr|Prof|Capt|Cpt|Lt|He\s|She\s|It\s|They\s|Their\s|Our\s|We\s|But\s|However\s|That\s|This\s|Wherever)"
acronyms = r"([A-Z][.][A-Z][.](?:[A-Z][.])?)"
websites = r"[.](com|net|org|io|gov|edu|me)"
digits = r"([0-9])"
multiple_dots = r"\.{2,}"


def filter_score(obj, perc, min_score=2):
    """Return True when the object should be dropped because of its score.

    An object is dropped when it has no score, or when its score lies below
    either the absolute floor ``min_score`` or the cut-off ``perc``.
    """
    score = get_basic_attribute(obj, "score")
    if score is None:
        return True
    too_low = score < min_score or score < perc
    return bool(too_low)


def filter_upvote_ratio(obj, perc, threshold=0.5):
    """Return True when the upvote ratio is present but below a cut-off.

    Objects without an ``upvote_ratio`` are never dropped by this filter.
    Otherwise the ratio has to reach both ``threshold`` and ``perc``.
    """
    ratio = get_basic_attribute(obj, "upvote_ratio")
    if ratio is None:
        return False
    return bool(ratio < threshold or ratio < perc)


def filter_num_comments(obj, min_comments=3):
    """Return True when the object reports fewer than ``min_comments`` comments.

    Objects without a ``num_comments`` value are kept.
    """
    comment_count = get_basic_attribute(obj, "num_comments")
    if comment_count is None:
        return False
    return bool(comment_count < min_comments)


def filter_domain(obj):
    """Return True when the object has a domain that does not contain "self."."""
    domain = get_basic_attribute(obj, "domain")
    if not domain:
        # Missing or empty domain: nothing to filter on.
        return False
    return "self." not in domain


def filter_link_flair_text(obj):
    """Return True when the link flair is unwanted for the current subreddit.

    Reads the module-level ``subreddit``. Only AskEconomics (allow-list),
    financialindependence (deny-list) and explainlikeimfive (must be
    "Economics") have flair rules; every other subreddit is never filtered.
    """
    # only potentially relevant if trying to filter for questions/content on specific topic
    flair = get_basic_attribute(obj, "link_flair_text")

    if subreddit == "AskEconomics":
        allowed = ("Approved Answers", "Good Question", "Simple Questions/Career")
        return flair not in allowed

    if subreddit == "financialindependence":
        rejected = ("Mod Post", "Case Study", "Moderator Meta", "Personal Journey")
        return flair in rejected

    if subreddit == "explainlikeimfive":
        return bool(flair != "Economics")

    return False


def filter_link_flair_richtext(obj):
    """Placeholder filter: no object is ever dropped based on link-flair richtext."""
    return False


def filter_author_flair(obj):
    """Return True when the author's richtext flair is "Emeritus Moderator".

    The richtext flair is read through ``get_richtext``; no other flair value
    triggers this filter.
    """
    # TODO: Figure out if relevant in other subreddits
    author_flair_richtext = get_richtext(obj, "author_flair_richtext")
    return bool(author_flair_richtext == "Emeritus Moderator")


# def filter_author(obj):
#     author = get_basic_attribute(obj, "author")
#     selftext = get_basic_attribute(obj, "selftext")
#     # Personal Finance specific filter for challenges
#     # TODO: Figure out if relevant in other subreddits
#     if author == "IndexBot" or author == "AutoModerator":
#         if selftext and "30-day challenge" in selftext:
#             return True
#     return False


def filter_bot_author(obj):
    """Return True when the author name or author flair matches a known bot name."""
    # TODO: keep relevant bot postings
    flair = get_basic_attribute(obj, "author_flair_text")
    author_name = get_basic_attribute(obj, "author")
    bot_names = ("IndexBot", "AutoModerator", "Moderation Bot")
    for candidate in (flair, author_name):
        if candidate is not None and candidate in bot_names:
            return True
    return False


def filter_content(obj, comment_as_submission=False):
    """Return True when the textual content is missing, empty or removed.

    With ``comment_as_submission`` set, only ``body`` is inspected. Otherwise
    ``selftext`` and ``title`` are inspected: the object is dropped when either
    is None, when both are empty, or when either is a removal placeholder.
    """
    placeholders = ("[removed]", "[deleted]")

    if comment_as_submission == True:
        body = get_basic_attribute(obj, "body")
        return body is None or body == "" or body in placeholders

    selftext = get_basic_attribute(obj, "selftext")
    title = get_basic_attribute(obj, "title")

    if selftext is None or title is None:
        return True
    if selftext == "" and title == "":
        return True
    return selftext in placeholders or title in placeholders


def filter_body(obj, perc):
    """Return True when a comment body is missing, removed or shorter than 30 words.

    ``perc`` is accepted for signature compatibility but is not used.
    """
    body = get_basic_attribute(obj, "body")
    if body is None or body in ("", "[removed]", "[deleted]"):
        return True
    word_count = len(body.split())
    return word_count < 30


def filter_stickied(obj):
    """Return True when the object's ``stickied`` attribute is truthy."""
    return bool(get_basic_attribute(obj, "stickied"))


def filter_collapsed(obj):
    """Return True when the object's ``collapsed`` attribute is truthy."""
    return bool(get_basic_attribute(obj, "collapsed"))


def filter_gilded(obj, perc, min_gildings=1):
    """Return True when the gilding count is present but below a cut-off.

    Objects without a ``gilded`` value are kept. Otherwise the count has to
    reach both ``min_gildings`` and ``perc``.
    """
    gild_count = get_basic_attribute(obj, "gilded")
    if gild_count is None:
        return False
    return bool(gild_count < min_gildings or gild_count < perc)


def filter_awardings(obj, perc, min_awards=2):
    """Return True when the object did not receive enough awards.

    Two checks are applied in order:

    1. ``total_awards_received`` (if present) must reach ``min_awards`` and ``perc``.
    2. If ``all_awardings`` is a non-empty sequence, at least one of its first
       two entries must have a ``count`` of 2 or more.
    """
    total_awards = get_basic_attribute(obj, "total_awards_received")
    if total_awards is not None and (total_awards < min_awards or total_awards < perc):
        return True

    awardings = get_basic_attribute(obj, "all_awardings")
    if awardings is None or len(awardings) == 0:
        return False

    first_is_low = awardings[0]["count"] < 2
    if len(awardings) == 1:
        return bool(first_is_low)
    # Only the first two award entries are ever inspected.
    return bool(first_is_low and awardings[1]["count"] < 2)

def filter_distinguished(obj):
    """Return True for moderator/admin-distinguished objects in selected subreddits.

    Reads the module-level ``subreddit``; subreddits outside the list below are
    never filtered by this check.
    """
    checked_subreddits = ["AskEconomics", "financialindependence", "StockMarket", "options", "RealEstate",
                          "Economics"]
    if subreddit not in checked_subreddits:
        return False
    return get_basic_attribute(obj, "distinguished") in ("moderator", "admin")


# Taken from https://stackoverflow.com/questions/4576077/how-can-i-split-a-text-into-sentences
def split_into_sentences(text: str) -> list[str]:
    """
    Split the text into sentences.

    Periods that do not end a sentence (titles, websites, decimals, acronyms,
    single-letter abbreviations, ellipses) are temporarily replaced by the
    marker "<prd>"; real sentence ends are tagged with "<stop>", which is then
    used as the split point.

    If the text contains substrings "<prd>" or "<stop>", they would lead
    to incorrect splitting because they are used as markers for splitting.

    :param text: text to be split into sentences
    :type text: str

    :return: list of sentences
    :rtype: list[str]
    """
    text = " " + text + "  "
    text = text.replace("\n", " ")

    # Protect periods that are not sentence boundaries.
    text = re.sub(prefixes, "\\1<prd>", text)
    text = re.sub(websites, "<prd>\\1", text)
    text = re.sub(digits + "[.]" + digits, "\\1<prd>\\2", text)
    # A run of dots keeps its length and ends the sentence.
    text = re.sub(multiple_dots, lambda match: "<prd>" * len(match.group(0)) + "<stop>", text)
    text = text.replace("Ph.D.", "Ph<prd>D<prd>")
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    text = re.sub(r"\s" + alphabets + "[.] ", " \\1<prd> ", text)
    text = re.sub(acronyms + " " + starters, "\\1<stop> \\2", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>\\3<prd>", text)
    text = re.sub(alphabets + "[.]" + alphabets + "[.]", "\\1<prd>\\2<prd>", text)
    text = re.sub(" " + suffixes + "[.] " + starters, " \\1<stop> \\2", text)
    text = re.sub(" " + suffixes + "[.]", " \\1<prd>", text)
    text = re.sub(" " + alphabets + "[.]", " \\1<prd>", text)

    # Move terminal punctuation outside closing quotes so the quote stays with
    # its sentence. str.replace is a no-op when the pattern is absent, so no
    # membership pre-check is needed.
    text = text.replace(".”", "”.")
    text = text.replace(".\"", "\".")
    text = text.replace("!\"", "\"!")
    text = text.replace("?\"", "\"?")

    # Tag the remaining terminators as sentence ends, then restore the periods.
    text = text.replace(".", ".<stop>")
    text = text.replace("?", "?<stop>")
    text = text.replace("!", "!<stop>")
    text = text.replace("<prd>", ".")

    sentences = [s.strip() for s in text.split("<stop>")]
    # The padding added at the start usually leaves an empty trailing piece.
    if sentences and not sentences[-1]:
        sentences = sentences[:-1]
    return sentences


def evaluate_whether_question(obj, attribute):
    """Heuristically decide whether the text in ``attribute`` asks a question.

    The text is lower-cased, cross-post markers and bracketed/parenthesised
    parts are removed, and then it counts as a question when

    * it has at least four words, and
    * it ends with "?" or contains one of the help/advice phrases, or
    * (``selftext`` only) it has more than three sentences and the second- or
      third-to-last sentence ends with "?".

    :param obj: parsed submission/comment object
    :param attribute: name of the text attribute to inspect
    :return: True when the text looks like a question; False otherwise,
        including when the attribute is missing
    """
    text = get_basic_attribute(obj, attribute)
    if text is None:
        # Nothing to inspect; previously this crashed on text.lower().
        return False
    _hit_words = [
        "help me",
        "need help",
        "any help",
        "please help",
        "get some help",
        "please advise",
        "advice",
        "recommendations",
        "can i ",
        "should i ",
        "do i ",
        "anyone know",
        "does it make sense",
    ]
    text = text.lower()
    # remove xpost content from titles
    text = re.sub(r"xpost from r/.*?($|\s)", "", text)
    # drop anything in (...) or [...], e.g. tags in titles
    text = re.sub(r"[\(\[].*?[\)\]]", "", text).strip()
    # An empty text has zero words, so this also covers the empty case.
    if len(text.split()) < 4:
        return False
    if text.endswith("?") or any(word in text for word in _hit_words):
        return True
    if attribute == "selftext":
        sentences = split_into_sentences(text)
        if len(sentences) > 3:
            # endswith() is safe even if a sentence turns out to be empty.
            if sentences[-2].endswith("?") or sentences[-3].endswith("?"):
                return True
    return False


def filter_question(obj, attributes):
    """Return True when none of the given text attributes looks like a question.

    Reads the module-level ``subreddit`` and ``good_question_ids``. Objects in
    explainlikeimfive and objects whose ``t3_``-prefixed id is listed in
    ``good_question_ids`` for the current subreddit are never filtered.
    """
    if subreddit == "explainlikeimfive":
        return False

    full_id = "t3_" + get_basic_attribute(obj, "id")
    if subreddit in good_question_ids:
        if full_id in good_question_ids[subreddit]:
            return False

    asks_question = any(evaluate_whether_question(obj, attribute) for attribute in attributes)
    return not asks_question


def filter_object(obj, comment_as_submission=False):
    """Run the filter chain for ``obj`` and report the first filter that fires.

    Which chain runs depends on ``comment_as_submission`` and on the
    module-level ``submission_string`` ("submissions" or "comments"). The
    function also reads the module-level ``percentiles_dict``, ``percentile``
    and ``curated_or_not``.

    :param obj: parsed submission/comment object
    :param comment_as_submission: treat a comment as if it were a submission
    :return: tuple ``(filtered, reason)``; ``filtered`` is True when the object
        should be dropped, ``reason`` is a short human-readable explanation
    """
    min_score = 3
    threshold = 0.75
    if comment_as_submission:
        if filter_score(obj, perc=percentiles_dict["score"], min_score=min_score):
            return True, "Score too low"
        if filter_content(obj, comment_as_submission=True):
            return True, "part of content is [deleted] or [removed] or completely empty"
        if filter_author_flair(obj):
            return True, "Author flair is Moderator/Creator"
        if filter_bot_author(obj):
            return True, "Submission was written by bot"
        if filter_question(obj, ["body"]):
            return True, "Submission does not contain a question"

    elif submission_string == "submissions":
        # Score-based filters are skipped entirely for the 0th percentile run.
        if percentile != 0:
            if filter_score(obj, perc=percentiles_dict["score"], min_score=min_score):
                return True, "Score too low"
            if filter_upvote_ratio(obj, perc=percentiles_dict["upvote_ratio"], threshold=threshold):
                return True, "Upvote ratio too low"
        if filter_num_comments(obj, min_comments=3):
            return True, "Number of comments too low"
        if filter_domain(obj):
            # filter_domain fires when the domain does NOT contain "self.".
            return True, "Domain is not self"
        if filter_author_flair(obj):
            return True, "Author flair is Moderator/Creator"
        if filter_content(obj):
            return True, "part of content is [deleted] or [removed] or completely empty"
        if filter_question(obj, ["title", "selftext"]):
            return True, "Submission does not contain a question"
        if filter_stickied(obj):
            return True, "Stickied"
        if filter_bot_author(obj):
            return True, "Submission was written by bot"
        if filter_link_flair_text(obj):
            return True, "Submission has irrelevant link flair"
        if filter_distinguished(obj):
            return True, "Submission is likely mod content"

    elif submission_string == "comments":
        # Uncomment this in case you want to prefilter comments based on their score and upvote ratio
        # if percentile != 0:
        #     if filter_score(obj, perc=min_score, min_score=min_score):
        #         return True, "Score too low"
        #     if filter_upvote_ratio(obj, perc=threshold, threshold=threshold):
        #         return True, "Upvote ratio too low"
        if filter_body(obj, perc=percentiles_dict["body"]):
            return True, "Body is [deleted] or [removed] or too short"
        if filter_collapsed(obj):
            return True, "Comment is collapsed"
        if filter_bot_author(obj):
            return True, "Comment was written by bot"
        if curated_or_not == "gildings_and_awardings":
            if filter_gilded(obj, percentiles_dict["gilded"], min_gildings=2):
                return True, "Comment wasn't gilded enough"
            if filter_awardings(obj, percentiles_dict["total_awards_received"], min_awards=2):
                return True, "Comment wasn't awarded enough"

    return False, "Not filtered"


def collect_parent_information(obj, parent_ids_kept, comment_is_submission=False):
    """Store a trimmed copy of a kept parent in ``parent_ids_kept`` and return it.

    The record is keyed by the object's ``name``; when that is missing the key
    is built from ``obj["id"]`` with the prefix ``t3_`` (submission) or ``t1_``
    (comment treated as submission), so that it can be matched against the
    link_id/parent_id of comments.
    """
    wanted_keys = ("num_comments", "title", "selftext", "score", "upvote_ratio", "ups", "downs", "author",
                   "created_utc", "retrieved_on", "retrieved_utc")
    record = {key: get_basic_attribute(obj, key) for key in wanted_keys}

    full_name = get_basic_attribute(obj, "name")
    if full_name is None:
        # id is only the unique identifier; the prefix marks the object type
        prefix = "t1_" if comment_is_submission else "t3_"
        full_name = prefix + obj["id"]
    record["id"] = full_name

    if comment_is_submission:
        # A comment has no title/selftext: its body plays the role of the selftext.
        record["selftext"] = get_basic_attribute(obj, "body")
        record["title"] = ""
        # arbitrary high number instead of None since we don't know the actual
        # amount of subcomments in advance
        record["num_comments"] = 10000

    record["comment_is_submission"] = comment_is_submission
    parent_ids_kept[full_name] = record
    return parent_ids_kept


def collect_comment_information(obj):
    """Return a dict holding only the comment attributes that are kept downstream."""
    wanted_keys = ("body", "score", "upvote_ratio", "ups", "downs", "author", "parent_id", "name", "id",
                   "created_utc", "retrieved_on", "retrieved_utc")
    return {key: get_basic_attribute(obj, key) for key in wanted_keys}


def iterate_over_file(file_path, stats, submission=True):
    """Filter one zst dump file and write the surviving lines to a new zst file.

    Reads the module-level ``subreddit``, ``submission_string``,
    ``curated_or_not``, ``percentile`` and ``comment_as_submission_ids``.

    In submission mode every kept submission is also recorded in
    ``<subreddit>_parent_ids_kept.json``. In comment mode that file is loaded
    again, kept comments are attached to their parent, and each parent is
    written to ``<subreddit>_filtered_combined.zst`` once its remaining
    ``num_comments`` counter reaches zero (or at the end of the file if it has
    at least one comment). Parents left without comments are dumped to
    ``<subreddit>_parent_ids_kept_after_merging.json``.

    :param file_path: path of the zst input file
    :param stats: dict that receives per-subreddit counters (only filled when
        the current subreddit has no entry yet)
    :param submission: True for a submissions file, False for a comments file
    """
    file_size = os.stat(file_path).st_size
    file_lines = 0
    created = None
    bad_lines = 0
    filtered = 0
    # For Filtering
    output_dir = os.path.join(PathManager.get_filtered_all_comments_path(), f"{curated_or_not}_percentile{percentile}")
    output_path = os.path.join(output_dir, f"{subreddit}_filtered_{submission_string}.zst")
    handle = zstandard.ZstdCompressor().stream_writer(open(output_path, 'wb'))
    parent_ids_kept = {}
    if not submission:
        output_path_combined = os.path.join(output_dir, f"{subreddit}_filtered_combined.zst")
        handle_combined = zstandard.ZstdCompressor().stream_writer(open(output_path_combined, 'wb'))
        with open(os.path.join(output_dir, f"{subreddit}_parent_ids_kept.json")) as parents_file:
            parents = json.load(parents_file)
    for line, file_bytes_processed in read_lines_zst(file_path):
        file_lines += 1
        if file_lines % 100000 == 0:
            print(
                f"{created} Line: {file_lines:,} Bad Lines: {bad_lines:,} Bytes Processed: {file_bytes_processed:,} : {(file_bytes_processed / file_size) * 100:.0f}%")
        try:
            obj = json.loads(line)
            created = datetime.utcfromtimestamp(int(obj["created_utc"])).strftime("%Y/%m/%d")

            if not submission:
                # Additional Handling for comments_as_submissions
                current_link_id = obj["link_id"]
                current_parent_id = obj["parent_id"]
                if subreddit in comment_as_submission_ids and current_parent_id in comment_as_submission_ids[subreddit]:
                    filter_obj, reason = filter_object(obj, comment_as_submission=True)
                    if filter_obj:
                        filtered += 1
                        continue
                    parents = collect_parent_information(obj, parents, comment_is_submission=True)
                else:
                    # Attach the comment to the submission it belongs to, or
                    # failing that to its direct parent; otherwise drop it.
                    if current_link_id in parents:
                        current_id = current_link_id
                    elif current_parent_id in parents:
                        current_id = current_parent_id
                    else:
                        filtered += 1
                        continue
                    # Count down the outstanding comments of this parent, whether
                    # or not the comment itself survives the filters.
                    parents[current_id]['num_comments'] -= 1
                    if 'comments' not in parents[current_id]:
                        parents[current_id]['comments'] = []
                    filter_obj, reason = filter_object(obj)
                    if filter_obj:
                        if parents[current_id]['num_comments'] == 0:
                            _write_combined_parent(parents, current_id, handle_combined)
                        filtered += 1
                        continue
                    parents[current_id]['comments'].append(collect_comment_information(obj))
                    if parents[current_id]['num_comments'] == 0:
                        _write_combined_parent(parents, current_id, handle_combined)
            if submission:
                current_id = get_basic_attribute(obj, "name")
                if current_id is None:
                    current_id = "t3_" + obj["id"]
                if subreddit in comment_as_submission_ids and current_id in comment_as_submission_ids[subreddit]:
                    filtered += 1
                    continue
                filter_obj, reason = filter_object(obj)
                if filter_obj:
                    filtered += 1
                    continue
                parent_ids_kept = collect_parent_information(obj, parent_ids_kept)
            write_line_zst(handle, line)

        except (KeyError, json.JSONDecodeError) as err:
            # Concatenating str and an exception object raises TypeError, so
            # format the error instead, and count the line as bad.
            bad_lines += 1
            print(f"Error: {err!r}")
    kept = file_lines - filtered
    if subreddit not in stats:
        stats[subreddit] = {
            "filtered": filtered,
            "total": file_lines,
            "kept": kept,
            # Guard against an empty input file.
            "percentage_kept": round(kept / file_lines * 100, 2) if file_lines else 0.0,
        }
    print(f"Filtered out {filtered} of {file_lines} {submission_string}")
    print(f"Wrote {kept} {submission_string} to {output_path}")
    if not submission:
        print(f"{len(parents)} submissions not matched with all their comments")
        # Iterate over a snapshot of the keys because entries are deleted on the way.
        for parent_id in list(parents):
            if parents[parent_id].get("comments"):
                _write_combined_parent(parents, parent_id, handle_combined)
        handle_combined.close()
        print(f"{len(parents)} submissions still not matched with comments")
        with open(os.path.join(output_dir, f"{subreddit}_parent_ids_kept_after_merging.json"), "w") as file_handle:
            json.dump(parents, file_handle)
    if submission:
        with open(os.path.join(output_dir, f"{subreddit}_parent_ids_kept.json"), "w") as file_handle:
            json.dump(parent_ids_kept, file_handle)
    handle.close()


def _write_combined_parent(parents, parent_id, handle_combined):
    """Write one parent with its collected comments to the combined file and drop it from ``parents``."""
    parent = parents[parent_id]
    # Replace the countdown value by the number of comments actually kept.
    parent['num_comments'] = len(parent['comments'])
    write_line_zst(handle_combined, json.dumps(parent))
    del parents[parent_id]


def create_filtering_directory():
    """Create the output directory for the current ``curated_or_not``/``percentile`` run.

    Both values are read from module level; an existing directory is left as is.
    """
    run_directory = os.path.join(
        PathManager.get_filtered_all_comments_path(),
        f"{curated_or_not}_percentile{percentile}",
    )
    os.makedirs(run_directory, exist_ok=True)
    print(f"created directory: {run_directory}")


def get_percentiles():
    """Compute the cut-off value of every relevant attribute for the current run.

    Reads the module-level ``submission_string``, ``subreddit`` and
    ``percentile``. For each attribute the file
    ``<analysis path>/<subreddit>/<submission_string>_<attr>_occurrences.json``
    is loaded and the ``percentile``-th percentile of its non-None values is
    taken. For ``body`` the values are word counts of the stored texts.

    :return: dict mapping attribute name to its percentile value as a float
        (0.0 when no values are available)
    """
    if submission_string == "submissions":
        attribute_list = ["score", "upvote_ratio", "num_comments"]
    else:
        attribute_list = ["score", "upvote_ratio", "body", "gilded", "total_awards_received"]

    percentile_scores = {}
    for attr in attribute_list:
        occurrences_path = os.path.join(PathManager.get_analysis_path(), subreddit,
                                        f"{submission_string}_{attr}_occurrences.json")
        with open(occurrences_path, "r") as f:
            unique_attribute_values = json.load(f)

        # Drop missing entries before building the array so that numpy gets a
        # purely numeric array instead of an object array containing None.
        values = [value for value in unique_attribute_values[attr] if value is not None]
        if attr == "body":
            values = [len(body.split()) for body in values]

        if not values:
            percentile_scores[attr] = 0.0
        else:
            # Plain float keeps the result JSON-serialisable and numpy-free.
            percentile_scores[attr] = float(np.percentile(np.array(values), percentile))
    return percentile_scores


if __name__ == "__main__":
    # The functions above read several names assigned here as module-level
    # globals (subreddit, submission_string, percentile, curated_or_not,
    # percentiles_dict, good_question_ids, comment_as_submission_ids), so these
    # names must stay at module level and keep their spelling.
    handle_all = True
    consider_gilding_and_awarding = "gildings_and_awardings"
    subreddits = ["personalfinance", "financialindependence", "FinancialPlanning", "investing", "wallstreetbets",
                  "Wallstreetbetsnew", "stocks", "StockMarket", "pennystocks", "options", "RealEstate", "Economics",
                  "realestateinvesting", "AskEconomics", "explainlikeimfive"]
    submission_stats = {}
    comment_stats = {}

    # Hand-picked submission ids that always count as questions, one JSON file per subreddit.
    good_questions_dir = os.path.join(PathManager.get_special_submission_ids_path(), "good_questions")
    good_question_files = os.listdir(good_questions_dir)
    good_question_ids = {}
    for el in good_question_files:
        subreddit_name = el.split(".")[0]
        with open(os.path.join(good_questions_dir, el), "r") as id_input_file:
            good_question_ids[subreddit_name] = json.load(id_input_file)

    # Ids whose child comments are treated as submissions, one JSON file per subreddit.
    comment_as_submission_dir = os.path.join(PathManager.get_special_submission_ids_path(), "comment_as_submission")
    comment_as_submission_files = os.listdir(comment_as_submission_dir)
    comment_as_submission_ids = {}
    for el in comment_as_submission_files:
        subreddit_name = el.split(".")[0]
        with open(os.path.join(comment_as_submission_dir, el), "r") as id_input_file:
            comment_as_submission_ids[subreddit_name] = json.load(id_input_file)

    if not handle_all:
        # NOTE(review): this single-subreddit branch never assigns
        # `curated_or_not`, `percentile` or `percentiles_dict`, which
        # iterate_over_file/filter_object read as globals; as written it raises
        # NameError. Decide on the intended values before enabling it.
        subreddit = "personalfinance"
        submission_file = f"{subreddit}_submissions.zst"
        comment_file = f"{subreddit}_comments.zst"
        submission_string = "submissions"
        submission_path = os.path.join(PathManager.get_data_path(), submission_file)

        comment_path = os.path.join(PathManager.get_data_path(), comment_file)

        iterate_over_file(submission_path, submission_stats, submission=True)
        for key, value in submission_stats[subreddit].items():
            print(f"{key}: {value}")
        submission_string = "comments"
        iterate_over_file(comment_path, comment_stats, submission=False)
        for key, value in comment_stats[subreddit].items():
            print(f"{key}: {value}")
    if handle_all:
        submission_or_comment = [True, False]
        consider_gilding_and_awarding = ["gildings_and_awardings", "no_gilding_and_awards"]
        percentiles = [0, 80, 90, 95, 98, 99]
        # Calculate all required percentiles for submissions and comments of all considered subreddits
        if os.path.exists("./percentiles.json"):
            with open("percentiles.json", "r") as percentiles_handle:
                percentiles_dict_overall = json.load(percentiles_handle)
        else:
            percentiles_dict_overall = {}
            for percentile in percentiles:
                # Use string keys: that is what JSON produces on reload and what
                # the lookup below expects. Int keys caused a KeyError on the
                # first run, right after percentiles.json had been written.
                per_percentile = percentiles_dict_overall.setdefault(str(percentile), {})
                for subreddit in subreddits:
                    per_subreddit = per_percentile.setdefault(subreddit, {})
                    for is_submission in submission_or_comment:
                        submission_string = "submissions" if is_submission else "comments"
                        per_subreddit[submission_string] = get_percentiles()
                        print(f"Added percentiles {percentile} for subreddit {subreddit} {submission_string}")
            with open("percentiles.json", "w") as percentiles_handle:
                json.dump(percentiles_dict_overall, percentiles_handle)
        for curated_or_not in consider_gilding_and_awarding:
            for percentile in percentiles:
                create_filtering_directory()
                print(f"Handling {curated_or_not}_percentile{percentile}")
                submission_stats = {}
                comment_stats = {}
                for subreddit in subreddits:
                    print(f"Handling subreddit {subreddit}")
                    # Submissions first: the comment pass loads the parent ids
                    # written by the submission pass.
                    for is_submission in submission_or_comment:
                        submission_string = "submissions" if is_submission else "comments"
                        percentiles_dict = percentiles_dict_overall[str(percentile)][subreddit][submission_string]
                        file_name = f"{subreddit}_{submission_string}.zst"
                        path = os.path.join(PathManager.get_data_path(), file_name)
                        current_stats = submission_stats if is_submission else comment_stats
                        iterate_over_file(path, current_stats, submission=is_submission)
                run_directory = os.path.join(PathManager.get_filtered_all_comments_path(),
                                             f"{curated_or_not}_percentile{percentile}")
                with open(os.path.join(run_directory, "submissions_filtered_stats.json"), "w") as stats_handle:
                    json.dump(submission_stats, stats_handle)
                with open(os.path.join(run_directory, "comments_filtered_stats.json"), "w") as stats_handle:
                    json.dump(comment_stats, stats_handle)
