import sys
import os

# Resolve the directory two levels above this file, echo it, and append it to
# the import search path before the `src.*` imports that follow.
_this_dir = os.path.dirname(__file__)
DIRPATH = os.path.abspath(os.path.join(_this_dir, os.pardir, os.pardir))
print(DIRPATH)
sys.path.append(DIRPATH)

from src.util.load_config import get_config, print_config
from src.util.load_prompt import read_text_file
from datasets import load_dataset
from transformers import AutoTokenizer

# Load the project configuration and print it.
config = get_config()
print_config(config)

# Locations of the prompt text files, all rooted at <DIRPATH>/data/prompts/.
PROMPT_ROOT = f"{DIRPATH}/data/prompts/"
SYS_PROMPT = f"{PROMPT_ROOT}system_prompts/generate_email_from_intent.txt"
EMAIL_FORMAT = f"{PROMPT_ROOT}data_formats/metadata_with_intent.txt"

# Dataset identifier: "<working_organization>/<subset_name>-<percent>-split".
# NOTE(review): int() truncates, so a test_set_size whose product with 100
# lands just below a whole number (0.29 * 100 == 28.999999999999996) yields
# 28 rather than 29. Left unchanged because this name is later passed to
# load_dataset and must match an existing dataset -- confirm how the split
# dataset was named before switching to round().
repo_prefix = "/".join((config["working_organization"], config["subset_name"]))
split_percent = int(config["test_set_size"] * 100)
dataset_name = f"{repo_prefix}-{split_percent}-split"

# Tokenizer for the model named by config["small_model"]; used below for its
# chat template.
tokenizer = AutoTokenizer.from_pretrained(config["small_model"])

# Every prompt starts with the same system message.
system_prompt_text = read_text_file(SYS_PROMPT)
messages = [
    {
        "role": "system",
        "content": system_prompt_text,
    }
]

# Template for the user turn; it is filled in with str.format per row.
email_format = read_text_file(EMAIL_FORMAT)


def make_prompts_for_data(data):
    """Render one chat-template training example per row of ``data``.

    Each row must support key lookup for ``from``, ``to``, ``date``,
    ``subject``, ``previous_context``, ``user_intent`` and ``email``.
    The first six values are passed positionally, in that order, to
    ``email_format.format`` to build the user turn; ``email`` becomes the
    assistant turn. The module-level system ``messages`` are prepended.

    Args:
        data: An iterable of rows (mappings keyed by the column names above).

    Returns:
        A list with one entry per row: the result of
        ``tokenizer.apply_chat_template`` called with ``tokenize=False`` and
        ``add_generation_prompt=False``.
    """
    metadata_keys = (
        "from",
        "to",
        "date",
        "subject",
        "previous_context",
        "user_intent",
    )

    def render(row):
        # Read the metadata fields first, then the target email, in a fixed
        # order so the template receives its arguments positionally.
        field_values = [row[key] for key in metadata_keys]
        target_email = row["email"]

        conversation = [
            *messages,
            {"role": "user", "content": email_format.format(*field_values)},
            {"role": "assistant", "content": target_email},
        ]
        return tokenizer.apply_chat_template(
            conversation, tokenize=False, add_generation_prompt=False
        )

    return [render(row) for row in data]


def process_for_dataset_split(df, column_name="text"):
    """Return ``df`` with an extra column holding its generated prompts.

    Args:
        df: A dataset split; it is iterated by ``make_prompts_for_data`` and
            must provide an ``add_column(name, values)`` method.
        column_name: Name of the column that receives the prompts.

    Returns:
        Whatever ``df.add_column`` returns for the new prompt column.
    """
    print("Generating Prompts...")
    return df.add_column(column_name, make_prompts_for_data(df))


# Load the dataset and add the prompt column to the train and test splits.
data = load_dataset(dataset_name)
for split_name in ("train", "test"):
    data[split_name] = process_for_dataset_split(data[split_name])

# Drop the listed source columns now that the prompt column has been added.
source_columns = [
    "from",
    "to",
    "date",
    "subject",
    "previous_context",
    "user_intent",
    "email",
    "baseline_generation",
    "generated_rules",
    "gold_rule_email",
]
data = data.remove_columns(source_columns)


# Publish the result under the source dataset name plus a fixed suffix.
data.push_to_hub(f"{dataset_name}-naiveft-training-data")
