"""
This script generates adversarial datasets. For example, given the original SQuAD training set, it generates a new
SQuAD-adv set in which each question is reduced to a single wh-word instead of a whole question sentence.
"""
import os
import gzip
import json
import sys
import shutil
from spacy.lang.en import English

# Question words recognised as a question type. get_question_type scans them in
# this order, so an earlier entry wins when several appear in one question.
WH_WORDS = ["what", "which", "who", "whom", "whose", "where", "why", "how", "when"]
# Two-word "how ..." phrases that refine the plain "how" type.
HOW_QUESTIONS = ["how far", "how long", "how many", "how much", "how old"]

# Blank English pipeline; only its tokenizer is used in this file.
# `nlp.tokenizer` exists on both spaCy 2.x and 3.x, whereas
# `nlp.Defaults.create_tokenizer` was removed in spaCy 3.
nlp = English()
tokenizer = nlp.tokenizer

def get_question_type(question="", question_tokens=None):
    """Return the question type of a question.

    The type is the first entry of ``WH_WORDS`` (in list order, not in
    question order) that occurs as a token of the question. A "how" question
    is refined to one of ``HOW_QUESTIONS`` ("how many", "how long", ...) when
    that two-word phrase occurs as two adjacent tokens.

    Args:
        question: The question text. Only used to produce tokens when
            ``question_tokens`` is missing or empty; it is lower-cased first.
        question_tokens: Optional pre-computed tokens of the question. They
            are compared as-is against the lower-case entries of
            ``WH_WORDS``, so they are expected to be lower-cased already.

    Returns:
        A wh-word, a "how ..." phrase, or ``"others"`` when no wh-word is
        found.
    """
    if not question_tokens:
        question_tokens = [token.text for token in tokenizer(question.lower())]

    for key_word in WH_WORDS:
        if key_word not in question_tokens:
            continue
        if key_word != "how":
            return key_word

        # Match "how ..." phrases on adjacent tokens rather than on the raw
        # text: a substring test would find "how many" inside "show many",
        # and could not work at all when only tokens were supplied.
        bigrams = {
            " ".join(pair)
            for pair in zip(question_tokens, question_tokens[1:])
        }
        qs_type = key_word
        # No early exit: when several phrases are present, the one listed
        # last in HOW_QUESTIONS wins.
        for how_word in HOW_QUESTIONS:
            if how_word in bigrams:
                qs_type = how_word
        return qs_type

    return "others"

if __name__ == "__main__":
    # Usage: <script> SRC_FOLDER OUT_FOLDER
    # Reads SRC_FOLDER/train-bart.json and writes a copy in which every
    # question is replaced by its question type to
    # OUT_FOLDER/train-bart-wh-word.json.
    if len(sys.argv) < 3:
        sys.exit("usage: {} SRC_FOLDER OUT_FOLDER".format(sys.argv[0]))
    src_folder = sys.argv[1]
    out_folder = sys.argv[2]

    with open(os.path.join(src_folder, "train-bart.json"), encoding="utf-8") as f_in:
        data = json.load(f_in)

    for paragraphs in data["data"]:
        for context in paragraphs["paragraphs"]:
            for qa in context["qas"]:
                question = qa["question"]
                question_tokens = [token.text for token in tokenizer(question.lower())]
                q_type = get_question_type(question.lower(), question_tokens)
                if q_type != "others":
                    qa["question"] = q_type
                elif question_tokens:
                    # No wh-word in this question: keep only its first token.
                    qa["question"] = question_tokens[0]
                # An empty question has no tokens and is left unchanged.

    # Open the output only after the input has been fully processed, so a
    # failure above does not leave behind an empty or truncated file.
    with open(os.path.join(out_folder, "train-bart-wh-word.json"), "w") as f_out:
        json.dump(data, f_out, indent=4)
