import argparse
import json
import os

import PyPDF2
import pandas as pd
import requests
import requests.utils
import seaborn as sns

# Maps a conference name, as it appears as a substring of an entry's
# "booktitle", to the short conference label used by this script.
# Lookups are substring matches against the booktitle, and
# preprocess_acl_data takes the first matching key in insertion order,
# so the order of the entries is significant.
# "Findings" volumes are labelled with their parent conference.
# Commented-out entries are conferences currently excluded from the set.
MAJOR_CONFERENCES_ABBREVIATION_DICT = {
    "Annual Meeting of the Association for Computational Linguistics": "ACL",
    "Conference on Empirical Methods in Natural Language Processing": "EMNLP",
    # "European Chapter of the Association for Computational Linguistics": "EACL",
    "North American Chapter of the Association for Computational Linguistics": "NAACL",
    # "International Joint Conference on Natural Language Processing": "IJCNLP",
    "International Conference on Computational Linguistics": "COLING",
    "Language Resources and Evaluation": "LREC",
    "Findings of the Association for Computational Linguistics: ACL": "ACL",
    "Findings of the Association for Computational Linguistics: EMNLP": "EMNLP",
}


def load_anthology(file_name):
    """Load the ACL Anthology export from a JSON file.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        The deserialized JSON document. The rest of this script iterates
        over it as a sequence of entry dicts.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # JSON is UTF-8 encoded; do not depend on the platform's locale default,
    # which would corrupt or reject non-ASCII titles on some systems.
    with open(file_name, encoding="utf-8") as f:
        return json.load(f)


def has_code(acl_entry_dict):
    """Tell whether the anthology entry has a "Code" field."""
    return "Code" in acl_entry_dict


def newer_than(acl_entry_dict, year):
    """Tell whether the entry's "year" value is strictly greater than ``year``."""
    entry_year = acl_entry_dict["year"]
    return entry_year > year


def is_major_conference(acl_entry_dict):
    """Tell whether an entry belongs to the main track of a major conference.

    An entry qualifies when its "booktitle" contains one of the names listed
    in MAJOR_CONFERENCES_ABBREVIATION_DICT and does not mention "workshop" or
    "tutorial" (checked case-insensitively). Entries without a "booktitle"
    never qualify.
    """
    if "booktitle" not in acl_entry_dict:
        return False

    booktitle = acl_entry_dict["booktitle"]
    lowered_booktitle = booktitle.lower()
    # Workshops and tutorials co-located with a major conference are excluded.
    if "workshop" in lowered_booktitle or "tutorial" in lowered_booktitle:
        return False

    return any(conference_name in booktitle
               for conference_name in MAJOR_CONFERENCES_ABBREVIATION_DICT)


def preprocess_acl_data(acl_anthology_data):
    """Normalize the anthology entries in place.

    For every entry:
      * if it has a "booktitle", curly braces are stripped from it, and a
        "conference" field is set to the abbreviation of the first name in
        MAJOR_CONFERENCES_ABBREVIATION_DICT that the booktitle contains
        (left untouched when none matches);
      * its "year" is converted to ``int``.
    """
    for entry in acl_anthology_data:
        if "booktitle" in entry:
            cleaned_booktitle = entry["booktitle"].replace("{", "").replace("}", "")
            entry["booktitle"] = cleaned_booktitle

            # First match wins, following the dict's insertion order.
            for full_name, abbreviation in MAJOR_CONFERENCES_ABBREVIATION_DICT.items():
                if full_name in cleaned_booktitle:
                    entry["conference"] = abbreviation
                    break
        entry["year"] = int(entry["year"])


def _extract_pdf_text(pdf_path):
    """Return the text of all pages of the PDF at ``pdf_path``, joined by newlines."""
    with open(pdf_path, 'rb') as f:
        # PyPDF2 renamed PdfFileReader/extractText to PdfReader/extract_text;
        # prefer the new names when the installed version provides them and
        # fall back to the legacy ones otherwise.
        reader_cls = getattr(PyPDF2, "PdfReader", None) or PyPDF2.PdfFileReader
        pdf_reader = reader_cls(f)
        pages_text = []
        for page in pdf_reader.pages:
            extract = getattr(page, "extract_text", None) or page.extractText
            pages_text.append(extract())
        return "\n".join(pages_text)


def process_emnlp_2021(acl_anthology):
    """Download the EMNLP 2021 papers and find those that mention GitHub.

    Entries are selected when their "booktitle" contains the EMNLP conference
    name and their "year" equals 2021. Each selected paper's PDF (URL taken
    from its "PDF" field) is stored under ``.cache/<ID>`` and reused on later
    runs. The extracted text is written to the entry's "text" field, so the
    entries of ``acl_anthology`` are modified in place.

    Processing is best-effort: a paper whose download or parsing fails is
    reported on stdout and skipped, and it simply ends up without "text".

    Args:
        acl_anthology: Iterable of anthology entry dicts.

    Returns:
        The list of selected entries whose extracted text contains
        "github.com".
    """
    acl_booktitle = "Conference on Empirical Methods in Natural Language Processing"
    emnlp_papers = [
        entry for entry in acl_anthology
        if "booktitle" in entry
        and acl_booktitle in entry["booktitle"]
        and entry.get("year") == 2021
    ]

    cache_dir = ".cache"
    os.makedirs(cache_dir, exist_ok=True)

    for candidate in emnlp_papers:
        try:
            cache_file_path = os.path.join(cache_dir, candidate["ID"])
            if not os.path.isfile(cache_file_path):
                # A timeout keeps one stalled connection from hanging the run.
                response = requests.get(candidate["PDF"], timeout=60)
                if response.status_code != 200:
                    print("Failed {}".format(response))
                    continue
                with open(cache_file_path, 'wb') as f:
                    f.write(response.content)

            candidate["text"] = _extract_pdf_text(cache_file_path)
        except Exception as e:
            # Deliberately broad: network, filesystem and PDF-parsing errors
            # are all per-paper failures that must not abort the whole batch.
            # .get() keeps the report itself from raising on a missing field.
            print("Failed {} {}: {!r}".format(candidate.get("ID"), candidate.get("PDF"), e))
            continue

    # Each paper is tested against its own text; papers that failed above
    # have no "text" and are treated as not mentioning GitHub.
    # NOTE(review): this keeps papers that DO mention github.com - confirm
    # whether the complement (papers without a code link) was intended.
    emnlp_papers_with_github_link = [
        paper for paper in emnlp_papers
        if "github.com" in paper.get("text", "")
    ]
    print("")
    return emnlp_papers_with_github_link


def main():
    """Command-line entry point.

    Parses the arguments, loads and normalizes the anthology JSON, then runs
    the EMNLP 2021 processing step on it.
    """
    parser = argparse.ArgumentParser(description='Downloading reproducibility data for ACL Anthology')
    # Required: without it load_anthology would receive None and fail with an
    # opaque TypeError instead of a usage message.
    parser.add_argument("--anthology_json_path", type=str, required=True)
    # Still accepted so existing command lines keep working; nothing in this
    # function consumes it.
    parser.add_argument("--plot_dir", type=str)
    sns.set_theme()
    sns.set_style("darkgrid")

    args = parser.parse_args()

    acl_anthology = load_anthology(args.anthology_json_path)
    preprocess_acl_data(acl_anthology)

    process_emnlp_2021(acl_anthology)


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
