import sys
import os
import re

# Resolve the directory two levels above this file and make it importable,
# so the `src.*` imports below can be found when this file is run directly.
_script_dir = os.path.dirname(__file__)
DIRPATH = os.path.abspath(os.path.join(_script_dir, "..", ".."))
print(DIRPATH)
sys.path.append(DIRPATH)

from datasets import load_dataset
from src.util import get_config, print_config

# Name of the dataset column that clean_intent() reads and overwrites.
COLUMN_NAME = "small_model_baseline"

# Load the configuration via the project helper and echo it for the run log.
config = get_config()
print_config(config)

# Dataset identifier; used both to load the data and to push the result.
DATASET = config["dataset"]


# On every line that begins with "Here", matches from the start of the line up
# to and including the first ":" on that line (non-greedy; "." never crosses a
# newline, so a "Here" line without a colon does not match at all).
_HERE_PREFIX_RE = re.compile(r"^Here.*?:", flags=re.MULTILINE)


def parse_intent(text):
    """Strip ``Here ...:`` lead-ins from *text* and trim outer whitespace.

    For each line starting with ``Here``, the span from the start of that line
    through its first colon is removed; any text after that colon is kept.
    Lines starting with ``Here`` that contain no colon are left untouched.
    Leading and trailing whitespace of the overall result is then stripped.

    Args:
        text: The string to clean, or ``None`` for a missing value.

    Returns:
        The cleaned string, or ``None`` when *text* is ``None``.
    """
    if text is None:
        # Nothing to clean; pass the missing value through instead of letting
        # the regex substitution raise TypeError.
        return None
    return _HERE_PREFIX_RE.sub("", text).strip()


def clean_intent(row):
    """Return a one-key update for *row* with its COLUMN_NAME value cleaned.

    Args:
        row: A mapping that contains the ``COLUMN_NAME`` key.

    Returns:
        A dict ``{COLUMN_NAME: <cleaned value>}`` where the cleaned value is
        the result of ``parse_intent`` applied to the row's current value.
    """
    raw_value = row[COLUMN_NAME]
    return {COLUMN_NAME: parse_intent(raw_value)}


# Load the train split and run clean_intent over every row.
# NOTE: the push below targets the same identifier the data was loaded from.
# To keep a copy of the unmodified data, push the freshly loaded dataset to
# DATASET + "_original_generated" before mapping (currently disabled).
dataset = load_dataset(DATASET, split="train").map(clean_intent)
dataset.push_to_hub(DATASET)
