import sys
import os
import re

# Resolve, once, the directory two levels above the directory containing this
# file; echo it to stdout and then add it to the end of the import search path.
# NOTE(review): presumably this makes project-local packages importable —
# confirm against the repository layout.
_BASE_DIR = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "..", "..")
)
print(_BASE_DIR)
sys.path.append(_BASE_DIR)

from datasets import load_dataset

# Hub identifier used further down both to load the dataset and to push it back.
DATASET = "preference-agents/preference-enron"
# Full model identifier; only its last path segment is used in column names.
MODEL = "meta-llama/Meta-Llama-3-70B-Instruct"
# Text after the final "/" in MODEL (the whole string if it contains no "/").
MODEL_NAME = MODEL.rpartition("/")[2]


def parse_intent(text):
    """Delete known boilerplate lead-in sentences from *text* and trim it.

    Every occurrence of each phrase listed below is removed wherever it
    appears in the string (not only at the start). The phrases are applied
    one after another in the listed order, and the final result has its
    leading and trailing whitespace stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, stripped string.
    """
    boilerplate_patterns = (
        r"Here is the generated email:",
        r"Here is a generated email:",
        r"Here is an email that meets the specified intent:",
        r"Here is an email that matches the metadata and intent:",
        r"Here is an email generated based on the provided metadata, previous context, and intent:",
    )

    cleaned = text
    for pattern in boilerplate_patterns:
        # Each pattern is a literal sentence; matches are replaced by nothing.
        cleaned = re.sub(pattern, "", cleaned)

    return cleaned.strip()


def clean_rules(row):
    """Produce the cleaned-baseline entry for a single row.

    Reads ``row["generated_baseline_<MODEL_NAME>"]``, passes that value
    through ``parse_intent`` and returns the outcome in a one-item dict.

    Args:
        row: A mapping indexable by column name that contains the
            ``generated_baseline_<MODEL_NAME>`` key.

    Returns:
        A dict with the single key ``cleaned_baseline_<MODEL_NAME>`` whose
        value is the cleaned text.
    """
    source_column = f"generated_baseline_{MODEL_NAME}"
    target_column = f"cleaned_baseline_{MODEL_NAME}"
    return {target_column: parse_intent(row[source_column])}


# Load the "train" split of DATASET and run clean_rules over it through the
# dataset's map method, keeping the mapped result.
dataset = load_dataset(DATASET, split="train").map(clean_rules)
# Push the mapped dataset to the same hub identifier it was loaded from.
dataset.push_to_hub(DATASET)
