import sys
import os
import re

# Resolve the directory two levels above this file once, echo it to stdout,
# and append it to the module search path.
_two_levels_up = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "..", "..")
)
print(_two_levels_up)
sys.path.append(_two_levels_up)

from datasets import load_dataset

# Hub identifier of the dataset that is loaded and later pushed back.
DATASET = "preference-agents/preference-enron"
# Full model identifier in "<organisation>/<model>" form.
MODEL = "meta-llama/Meta-Llama-3-70B-Instruct"
# Final path component of MODEL (everything after the last "/"); used to
# build the per-model column names.
MODEL_NAME = MODEL.rpartition("/")[2]


def parse_intent(text):
    """Extract the rule text from a raw generation string.

    The leading part of ``text`` is discarded according to the first rule
    that matches:

    1. If an opening ``<rules>`` tag is present, everything up to and
       including its first occurrence is dropped.
    2. Otherwise, if a closing ``</thinking>`` tag is present, everything up
       to and including its first occurrence is dropped.
    3. Otherwise the text is kept as is.

    Every ``</rules>`` closing tag is then removed and surrounding
    whitespace is stripped.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    rules_open = "<rules>"
    thinking_close = "</thinking>"

    rules_at = text.find(rules_open)
    if rules_at != -1:
        # Keep only what follows the opening <rules> tag.
        text = text[rules_at + len(rules_open):]
    else:
        thinking_at = text.find(thinking_close)
        if thinking_at != -1:
            # Slice from the END of the </thinking> tag. The earlier code
            # added the tag length to an index that was already past the
            # tag, which cut off the first 11 characters of the content.
            text = text[thinking_at + len(thinking_close):]

    # Remove any closing </rules> tag(s), then trim surrounding whitespace.
    return text.replace("</rules>", "").strip()


def clean_rules(row):
    """Build the cleaned-rules column value for one row.

    Reads the ``generated_no_baseline_rules_<MODEL_NAME>`` entry of ``row``,
    passes it through ``parse_intent``, and returns the result in a
    single-entry dict keyed by ``cleaned_no_baseline_rules_<MODEL_NAME>``.

    Args:
        row: A mapping indexable by column name.

    Returns:
        A dict with one key holding the cleaned text.
    """
    source_column = f"generated_no_baseline_rules_{MODEL_NAME}"
    target_column = f"cleaned_no_baseline_rules_{MODEL_NAME}"
    return {target_column: parse_intent(row[source_column])}


# Load the train split, apply clean_rules to every row, then push the
# resulting dataset to the Hub under the same DATASET identifier.
dataset = load_dataset(DATASET, split="train").map(clean_rules)
dataset.push_to_hub(DATASET)
