import sys
import os

# Absolute path of the directory two levels above the one containing this
# file. It is printed and appended to ``sys.path`` -- presumably so that the
# ``src`` package imported further down can be resolved.
DIRPATH = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
)
print(DIRPATH)
sys.path.append(DIRPATH)

import re
from datasets import load_dataset
from src.util.load_config import get_config, print_config

# Load the project configuration and hand it to ``print_config``.
config = get_config()
print_config(config)

# Candidate columns whose raw LLM-evaluation output gets parsed into scores.
# The commented-out entries are skipped by this script.
columns_evaluated = [
    "baseline_generation",
    # "gold_rule_email",
    "naiveft_baseline",
    # "rulegen_rules_generated_email",
]

# Dataset id: "<working_organization>/<subset_name>-<percent>-split", where
# <percent> is ``test_set_size * 100`` truncated to an integer.
# NOTE(review): int() truncates rather than rounds, so e.g. 0.29 yields 28
# (0.29 * 100 == 28.999999999999996). Confirm against whatever created the
# dataset before changing this expression.
dataset_name = "".join(
    [
        config["working_organization"],
        "/",
        config["subset_name"],
        f"-{int(config['test_set_size'] * 100)}-split",
    ]
)

# Tag substituted into the evaluation column names below (presumably the
# size of the evaluator model -- TODO confirm).
msize = "8b"

# Template for the raw evaluation column: "<candidate>_llmeval_<msize>_raw".
column_with_eval = "{}_llmeval_{}_raw"

# Captures the digits that follow "Similarity Rating:" in the raw text.
pattern = r"Similarity Rating:\s*(\d+)"


def clean_text(ls, rating_pattern=None):
    """Extract the integer similarity rating from each raw evaluation text.

    Args:
        ls: Iterable of raw evaluation outputs. String entries are searched
            for the rating; any other value (for example ``None`` for a
            missing cell) is treated as having no rating instead of raising
            ``TypeError``.
        rating_pattern: Optional regex (string or compiled pattern) whose
            first capture group holds the rating digits. When omitted, the
            module-level ``pattern`` is used.

    Returns:
        A list with one element per input entry, in order: the rating as an
        ``int`` when the regex matches, otherwise ``None``.
    """
    # Compile once up front rather than passing the raw pattern to
    # ``re.search`` for every row.
    regex = re.compile(pattern if rating_pattern is None else rating_pattern)
    results = []
    for each in ls:
        # Guard against non-string cells, which ``re`` cannot search.
        match = regex.search(each) if isinstance(each, str) else None
        results.append(int(match.group(1)) if match else None)
    return results


def process_for_dataset_split(df):
    """Add a parsed score column for every entry in ``columns_evaluated``.

    For each candidate, the raw column named by ``column_with_eval`` is run
    through ``clean_text`` and the result is stored under
    ``"<candidate>_llmeval_<msize>"``. A column already carrying that name
    is removed first and then re-added with the new values.

    Args:
        df: Dataset split exposing ``column_names``, ``remove_columns`` and
            ``add_column``, and supporting column access by name.

    Returns:
        The split with the score columns added.
    """
    for candidate in columns_evaluated:
        scores = clean_text(df[column_with_eval.format(candidate, msize)])
        target = f"{candidate}_llmeval_{msize}"
        print(f"Adding column {target}")
        # Drop any existing column of the same name before adding it again.
        already_present = target in df.column_names
        if already_present:
            df = df.remove_columns([target])
        df = df.add_column(target, scores)
    return df


# Load the dataset, add the parsed score columns to the train and test
# splits (in that order), then push the result back under the same name.
dataset = load_dataset(dataset_name)
for split_name in ("train", "test"):
    dataset[split_name] = process_for_dataset_split(dataset[split_name])
dataset.push_to_hub(dataset_name)
