import random
import json
import pandas as pd

# Pin the RNG state so the same row positions are drawn on every run.
random.seed(0)
# 200 distinct positions out of 0..499; the same positions are applied to
# every input file further down.
sample_indices = random.sample(range(500), k=200)

# file_list = [
#     "llama_chat_dpo_rep_1_1.json",
#     "llama_chat_rep_1_2.json",
#     "llama_chat_sft_dpo_rep_1_2.json",
#     "llama_chat_sft_rep_1_1.json",
#     "llama_rep_1_1.json",
#     "llama_sft_dpo_rep_1_2.json",
#     "llama_sft_rep_1_3.json",
#     "mistral_rep_1_3.json",
#     "mistral_sft_dpo_rep_1_2.json",
#     "mistral_sft_rep_1_2.json",
#     "zephyr_dpo_rep_1_2.json",
#     "zephyr_rep_1_3.json",
#     "zephyr_sft_dpo_rep_1_3.json",
#     "zephyr_sft_rep_1_3.json",
# ]

# Input files, one per model variant, all sharing the "_rep_1_2.json" suffix.
# The order of the variants is the order in which the files are read.
file_list = [
    f"{variant}_rep_1_2.json"
    for variant in (
        "llama_chat_dpo",
        "llama_chat",
        "llama_chat_sft_dpo",
        "llama_chat_sft",
        "llama",
        "llama_sft_dpo",
        "llama_sft",
        "mistral",
        "mistral_sft_dpo",
        "mistral_sft",
        "zephyr_dpo",
        "zephyr",
        "zephyr_sft_dpo",
        "zephyr_sft",
    )
]
# Prompt-template markers removed from each question, applied strictly in
# this order (kept identical to the original sequence of strip calls, since
# reordering could change the result for prompts carrying several markers).
# NOTE(review): if a prompt ends with "</s>\n<|assistant|>\n", stripping
# "\n<|assistant|>\n" also consumes the newline, so the later "</s>\n" strip
# no longer matches and "</s>" stays in the text -- confirm against the data.
_TEMPLATE_AFFIXES = (
    (str.removeprefix, "[INST] "),
    (str.removesuffix, " [/INST]"),
    (str.removeprefix, "<|system|>\n\n<|user|>\n"),
    (str.removeprefix, "<|system|>\n</s>\n<|user|>\n"),
    (str.removesuffix, "\n<|assistant|>\n"),
    (str.removesuffix, "</s>\n"),
)


def _strip_chat_template(question):
    """Return *question* with the known template prefixes/suffixes removed."""
    for strip, marker in _TEMPLATE_AFFIXES:
        question = strip(question, marker)
    return question


sampled_frames = []
for file in file_list:
    # Decode explicitly as UTF-8 instead of relying on the platform default.
    with open(f"eval/final_data/{file}", "r", encoding="utf-8") as inputfile:
        cur_dataset = json.load(inputfile)
    for el in cur_dataset:
        el["questions"] = _strip_chat_template(el["questions"])
    # Take the same row positions from every file; the rows keep their
    # original positional labels as the DataFrame index.
    sampled_frames.append(pd.DataFrame(cur_dataset).iloc[sample_indices, :])

# Concatenate once rather than re-copying the accumulated frame per file.
overall_df = pd.concat(sampled_frames)

# Shuffle, then sort by the original row label so rows with the same label
# (one per input file) end up adjacent, in shuffled order within the group.
# pandas does not use Python's `random` state, so the shuffle needs its own
# seed, and the sort must be stable to keep the seeded tie order; together
# these make the exported row order reproducible.
overall_df = overall_df.sample(frac=1, random_state=0)
overall_df.sort_index(inplace=True, kind="stable")
overall_df.to_csv("human_eval_test_dataset.csv", sep=";", decimal=",")
    
