import random
import json
import pandas as pd

# Seed the global RNG so the same subset of examples is drawn on every run.
random.seed(0)
# Sorted dataset positions to export: 50 distinct indices drawn from 0..499.
sample_indices = random.sample(range(0, 500), 50)
sample_indices.sort()
file_list = [
    "llama_chat_dpo_rep_1_1.json",
    "llama_chat_rep_1_2.json",
    "llama_chat_sft_dpo_rep_1_2.json",
    "llama_chat_sft_rep_1_1.json",
    "llama_rep_1_1.json",
    "llama_sft_dpo_rep_1_2.json",
    "llama_sft_rep_1_3.json",
    "mistral_rep_1_3.json",
    "mistral_sft_dpo_rep_1_2.json",
    "mistral_sft_rep_1_2.json",
    "zephyr_dpo_rep_1_2.json",
    "zephyr_rep_1_3.json",
    "zephyr_sft_dpo_rep_1_3.json",
    "zephyr_sft_rep_1_3.json",
]

# Closing instruction markers removed from the end of a prompt. The
# backslash spelling is what this script originally looked for (written as
# an invalid "\I" escape); it is kept so data containing it is still cleaned.
# NOTE(review): "[/INST]" is presumably the marker actually present in the
# Llama-chat prompts - confirm against the files in eval/final_data.
_INST_CLOSERS = ("[/INST]", "[\\INST]")


def _strip_chat_template(question):
    """Return *question* without the chat-template wrappers around it.

    Removes a leading "[INST] ", a trailing instruction closer (see
    ``_INST_CLOSERS``) together with any spaces after it, a leading
    "<|system|>\\n\\n<|user|>\\n" and a trailing "\\n<|assistant|>\\n".
    Text that carries none of these wrappers is returned unchanged.
    """
    question = question.removeprefix("[INST] ")
    # Only drop trailing spaces when a closer actually follows the prompt,
    # so prompts without a marker keep their exact text.
    without_trailing_spaces = question.rstrip(" ")
    for closer in _INST_CLOSERS:
        if without_trailing_spaces.endswith(closer):
            question = without_trailing_spaces.removesuffix(closer)
            break
    question = question.removeprefix("<|system|>\n\n<|user|>\n")
    return question.removesuffix("\n<|assistant|>\n")


# Set view of the sampled positions for O(1) membership tests in the loop.
_sampled = set(sample_indices)
# One output row per sampled example, keyed by its position in the dataset so
# rows from different files line up even if a file is shorter than expected.
_rows_by_index = {}
for file in file_list:
    # JSON is UTF-8 by specification; do not depend on the locale encoding.
    with open(f"eval/final_data/{file}", "r", encoding="utf-8") as inputfile:
        cur_dataset = json.load(inputfile)
    # Column name for this file's predictions: everything before "_rep".
    file_name = file.split("_rep")[0]
    for dataset_idx, el in enumerate(cur_dataset):
        if dataset_idx not in _sampled:
            continue
        row = _rows_by_index.get(dataset_idx)
        if row is None:
            # First file that provides this example supplies the question
            # and the two reference answers shared by every column.
            row = _rows_by_index[dataset_idx] = {
                "questions": _strip_chat_template(el["questions"]),
                "reference_1": el["reference_1"],
                "reference_2": el["reference_2"],
            }
        row[file_name] = el["prediction"]

# Rows ordered by ascending dataset position, matching the sorted sample.
dataset_to_add = [_rows_by_index[i] for i in sorted(_rows_by_index)]
overall_df = pd.DataFrame(dataset_to_add)
overall_df.to_csv("qualitative_analysis.csv", sep=";", decimal=",")
    
