"""
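Clean the generated-email column of a Hugging Face dataset by removing
boilerplate lead-in sentences (e.g. "Here is the generated email:") and
push the cleaned dataset back to the Hub.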
"""
import sys
import os
import re
from datasets import load_dataset

DATASET = "preference-agents-experiments/kay-mann-20-split"
COLUMN_NAME = "rulegen_rules_generated_email"
CLEANED_COLUMN_NAME = "rulegen_rules_generated_email"

def parse_intent(text):
    """Remove known boilerplate lead-in sentences from a generated email.

    Every occurrence of each phrase below is deleted (matching is
    case-sensitive), after which leading and trailing whitespace is
    stripped from the result.

    Args:
        text: The generated email text to clean.

    Returns:
        The text with the lead-in phrases removed and outer whitespace
        stripped.
    """
    # Applied one after another, in this order.
    lead_in_patterns = (
        r"Here is the generated email:",
        r"Here is a generated email:",
        r"Here is an email that meets the specified intent:",
        r"Here is an email that matches the metadata and intent:",
        r"Here is an email based on the provided metadata, previous context, and intent:",
        r"Here is an email generated based on the provided metadata, previous context, and intent:",
        r"Here is an email that takes into account the metadata, previous context, and intent of the user:",
    )
    cleaned = text
    for pattern in lead_in_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned.strip()


def clean_rules(row):
    """Clean one dataset row's generated email.

    Reads ``row[COLUMN_NAME]``, passes it through ``parse_intent`` and
    returns the result keyed by ``CLEANED_COLUMN_NAME``.

    Args:
        row: A mapping that contains the ``COLUMN_NAME`` key.

    Returns:
        A single-entry dict mapping ``CLEANED_COLUMN_NAME`` to the cleaned
        text.
    """
    raw_email = row[COLUMN_NAME]
    return {CLEANED_COLUMN_NAME: parse_intent(raw_email)}


# Load the dataset and run clean_rules over it in one step.
# NOTE: COLUMN_NAME and CLEANED_COLUMN_NAME are the same string, so the
# value returned by clean_rules is stored under the source column's name.
dataset = load_dataset(DATASET).map(clean_rules)

# Push the cleaned dataset back to the repository id it was loaded from.
dataset.push_to_hub(DATASET)
