import json
import os
import re
import sys
# Words (and a few two-word phrases such as "each other") that the filtering
# step in this script treats as pronouns. The script tests answer texts and
# <ref> spans against this list by exact, case-sensitive membership, so every
# entry is lowercase except "I". Each entry appears exactly once.
pronoun_list = [
    "mine",
    "us",
    "you",
    "him",
    "whoever",
    "whomever",
    "themselves",
    "there",
    "your",
    "these",
    "where",
    "myself",
    "whose",
    "someone",
    "ourselves",
    "his",
    "whichever",
    "everybody",
    "yourselves",
    "anybody",
    "which",
    "our",
    "herself",
    "ours",
    "its",
    "my",
    "hers",
    "their",
    "her",
    "whosever",
    "whom",
    "yourself",
    "both",
    "she",
    "me",
    "himself",
    "itself",
    "I",
    "theirs",
    "those",
    "we",
    "he",
    "them",
    "who",
    "they",
    "somebody",
    "each other",
    "something",
    "it",
    "yours",
    "that",
    "others",
    "neither",
    "none",
    "wherever",
    "some",
    "thyself",
    "no one",
    "whereon",
    "thy",
    "whence",
    "whereof",
    "ye",
    "theirself",
    "whatever",
    "whatnot",
    "whether",
    "thee",
    "whosesoever",
    "anyone",
    "several",
    "many",
    "whereunto",
    "ourself",
    "thine",
    "anything",
    "such",
    "any",
    "all",
    "aught",
    "nobody",
    "somewhat",
    "either",
    "whoso",
    "themself",
    "suchlike",
    "whomsoever",
    "whichsoever",
    "wherewith",
    "everything",
    "idem",
    "nothing",
    "one another",
    "this",
    "wheresoever",
    "as",
    "most",
    "whosoever",
    "another",
    "naught",
    "thou",
    "whereto",
    "nought",
    "one",
    "other",
    "theirselves",
    "whatsoever",
    "yon",
    "whereby",
    "whomso",
    "everyone",
    "enough",
    "few",
    "wherefrom",
    "wherein",
    "whereinto",
    "wherewithal",
    "yonder",
    "what",
    "ought",
    "each",
]


# Captures the text between the first "<ref>" and the last "</ref>" of a
# question (the match is greedy and does not cross newlines).
_REF_PATTERN = re.compile(r"<ref>(.*)</ref>")

# Markup removed from a question before it is written to the questions file.
# -LSB-/-RSB- are presumably bracket placeholder tokens left by a tokenizer.
# The space-padded variants come first so that removing a token also removes
# one adjacent space; the bare variant catches whatever remains.
_MARKUP_TOKENS = (
    "<ref>", "</ref>",
    "-LSB- ", " -LSB-", "-LSB-",
    "-RSB- ", " -RSB-", "-RSB-",
)


def _strip_markup(question):
    """Return *question* with the <ref> tags and -LSB-/-RSB- tokens removed."""
    for token in _MARKUP_TOKENS:
        question = question.replace(token, "")
    return question


def _output_paths(input_path):
    """Return ``(filtered_path, questions_path)`` derived from *input_path*.

    For ``dir/name.json`` the results are ``dir/name-filterd.json`` and
    ``dir/questions-name.json``. The extension is split off with
    ``os.path.splitext`` rather than by dropping a fixed five characters, and
    the ``questions-`` prefix is applied to the file name only, so inputs
    that live in another directory still produce valid paths.
    """
    root, _ext = os.path.splitext(input_path)
    directory, stem = os.path.split(root)
    # NOTE: "filterd" (sic) is kept so existing consumers of the output file
    # name keep working.
    filtered_path = root + "-filterd.json"
    questions_path = os.path.join(directory, "questions-" + stem + ".json")
    return filtered_path, questions_path


def filter_dataset(input_data, pronouns=None):
    """Filter the question/answer pairs of a dataset by their <ref> span.

    *input_data* is read as a dict with a "version" entry and a "data" list;
    each item of "data" has a "title" and a list of "paragraphs"; each
    paragraph has a "context" and a list of "qas"; each qa has an "id", a
    "question" and a list of "answers" whose items have a "text".

    A qa is handled as follows:

    * it is dropped when its question has no ``<ref>...</ref>`` span;
    * it is kept when the referenced text, or any answer text, is in
      *pronouns*;
    * otherwise it is dropped when its only answer equals the referenced
      text;
    * otherwise it is kept.

    Paragraphs left without any qa, and items left without any paragraph,
    are omitted from the result. The kept qa dicts are passed through
    unmodified (questions keep their <ref> markup; rewriting them as cloze
    questions was noted as "not helpful" in an earlier version).

    Args:
        input_data: the parsed dataset described above.
        pronouns: iterable of strings treated as pronouns; defaults to the
            module-level ``pronoun_list``.

    Returns:
        A ``(filtered_data, question_refs)`` tuple. ``filtered_data`` has
        the same layout as *input_data*. ``question_refs`` maps each kept qa
        id to ``{"question": <question without markup>, "ref": <ref text>}``.
    """
    if pronouns is None:
        pronouns = pronoun_list
    # Build the lookup once; membership is tested for every answer.
    pronouns = frozenset(pronouns)

    filtered_data = {"version": input_data["version"], "data": []}
    question_refs = {}

    for article in input_data["data"]:
        kept_paragraphs = []
        for paragraph in article["paragraphs"]:
            kept_qas = []
            for qa in paragraph["qas"]:
                match = _REF_PATTERN.search(qa["question"])
                if match is None:
                    continue
                # Strip instead of slicing off a fixed number of characters
                # so the tags may or may not be padded with spaces.
                ref = match.group(1).strip()

                answers = [ans["text"] for ans in qa["answers"]]
                involves_pronoun = ref in pronouns or any(
                    ans in pronouns for ans in answers
                )
                if not involves_pronoun and answers == [ref]:
                    continue

                kept_qas.append(qa)
                question_refs[qa["id"]] = {
                    "question": _strip_markup(qa["question"]),
                    "ref": ref,
                }

            if kept_qas:
                kept_paragraphs.append(
                    {"context": paragraph["context"], "qas": kept_qas}
                )
        if kept_paragraphs:
            filtered_data["data"].append(
                {"title": article["title"], "paragraphs": kept_paragraphs}
            )

    return filtered_data, question_refs


def main(argv=None):
    """Filter the JSON dataset named by the first command-line argument.

    Writes the filtered dataset and the question/ref mapping to the two
    paths returned by ``_output_paths``. *argv* defaults to
    ``sys.argv[1:]``; arguments after the first are ignored. Exits with a
    usage message when no argument is given.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if not args:
        sys.exit("usage: " + os.path.basename(sys.argv[0]) + " INPUT.json")
    input_path = args[0]

    with open(input_path, "r", encoding="utf-8") as f_in:
        input_data = json.load(f_in)

    filtered_data, question_refs = filter_dataset(input_data)
    filtered_path, questions_path = _output_paths(input_path)

    # Outputs are opened only once the data is ready, so a failure while
    # filtering neither leaks a file handle nor leaves an empty output file.
    with open(filtered_path, "w", encoding="utf-8") as f_out:
        json.dump(filtered_data, f_out, indent=4)
    with open(questions_path, "w", encoding="utf-8") as f_out:
        json.dump(question_refs, f_out, indent=4)


if __name__ == "__main__":
    main()





