import pandas as pd
import os
from sklearn.metrics import cohen_kappa_score
import json


# Folder containing the Gemini rating CSVs; the per-file loop below reads the
# same file names from here as it finds in ``human_folder``.
gemini_folder = "predictions_new_template_500_fixed_50_selected"
# Folder containing the human rating CSVs; its listing drives which files are
# processed.
human_folder = "predictions_human"
# os.listdir() returns entries in arbitrary order, so sort them to keep the
# processing order (and the log output) reproducible between runs.
human_files = sorted(os.listdir(human_folder))


def preprocess_data(df: pd.DataFrame) -> None:
    """Normalise the five rating columns of *df* in place.

    For each of Relevance, Specificity, Simplicity, Helpfulness and
    Objectivity:

    * missing values become ``1``;
    * values whose text form contains a space become ``1``;
    * ``0``/``0.0`` and ``1``/``1.0`` become ``1``; ``2``..``5`` (with or
      without a trailing ``.0``) become the matching integer;
    * any other value is kept as its string form, unchanged.

    The ``prediction`` column is removed when present. The unique values of
    every processed column are printed for inspection.

    Args:
        df: Frame holding the five metric columns; modified in place.

    Raises:
        KeyError: If one of the metric columns is missing.
    """
    metrics = ["Relevance", "Specificity", "Simplicity", "Helpfulness", "Objectivity"]

    # Text form -> rating. Both "n" and "n.0" spellings are accepted, and 0 is
    # folded into the lowest rating, 1.
    rating_by_text = {}
    for rating in range(6):
        clamped = max(rating, 1)
        rating_by_text[str(rating)] = clamped
        rating_by_text[f"{rating}.0"] = clamped

    def normalise(value):
        """Map one raw cell to its rating (or to its text if unrecognised)."""
        if pd.isna(value):
            return 1
        text = str(value)
        if " " in text:
            return 1
        return rating_by_text.get(text, text)

    for metric in metrics:
        # Build the cleaned column in one pass and assign it back. This avoids
        # the chained ``df[metric].fillna(..., inplace=True)`` call, which does
        # not update ``df`` under pandas Copy-on-Write, and avoids writing
        # integers into a string-typed column. When every value is recognised
        # the result is a proper integer column.
        df[metric] = df[metric].map(normalise)
        print("After")
        print(metric)
        print(df[metric].unique())

    # Tolerate frames that carry no ``prediction`` column.
    if "prediction" in df.columns:
        del df["prediction"]

# Create the output folders up front so the first write does not fail when
# they do not exist yet.
os.makedirs("correlations", exist_ok=True)
os.makedirs("kappa_scores", exist_ok=True)

# For every human rating file, load the Gemini file with the same name, clean
# both, and write a Pearson correlation matrix plus per-metric Cohen's kappa.
for file in human_files:
    gemini_ratings = pd.read_csv(os.path.join(gemini_folder, file))
    human_ratings = pd.read_csv(os.path.join(human_folder, file))
    print(f"Handling file {file}")
    print("Prepocess gemini data...")
    preprocess_data(gemini_ratings)
    print("Prepocess human data...")
    preprocess_data(human_ratings)

    # Prefix the columns so both raters can live side by side in one frame.
    gemini_ratings = gemini_ratings.add_prefix("Gemini_")
    human_ratings = human_ratings.add_prefix("Human_")
    # NOTE(review): concat(axis=1) pairs rows by index, so this assumes both
    # files list the same items in the same order -- confirm.
    merged_ratings = pd.concat([gemini_ratings, human_ratings], axis=1)

    # splitext drops only the final extension, so names that contain extra
    # dots keep their full stem and do not collide in the output folders.
    new_file_name = os.path.splitext(file)[0]

    correlation = merged_ratings.corr(method="pearson").round(2)
    correlation.to_csv(f"correlations/{new_file_name}.csv")

    columns = ["Relevance", "Specificity", "Simplicity", "Helpfulness", "Objectivity"]
    results = {}
    for column in columns:
        cur_score = cohen_kappa_score(
            merged_ratings[f"Gemini_{column}"].tolist(),
            merged_ratings[f"Human_{column}"].tolist(),
        )
        results[column] = round(cur_score, 2)
    with open(f"kappa_scores/{new_file_name}.json", "w", encoding="utf-8") as f:
        json.dump(results, f)