import sys
import os

# Put the directory two levels above this script on sys.path (and echo it),
# so that the `src.*` package import that follows can be resolved.
_script_dir = os.path.dirname(__file__)
_project_root = os.path.abspath(os.path.join(_script_dir, os.pardir, os.pardir))
print(_project_root)
sys.path.append(_project_root)

from src.util.load_config import get_config, print_config
from datasets import load_dataset, DatasetDict

# Load the run configuration and echo it so the run is traceable in the logs.
config = get_config()
print_config(config)

# Hub repository holding the unsplit subset: "<organization>/<subset>".
source_repo_id = f"{config['working_organization']}/{config['subset_name']}"
test_set_size = config["test_set_size"]

# Load the "train" split of the subset and carve a held-out test set out of it.
data = load_dataset(source_repo_id, split="train")
train_test_split = data.train_test_split(test_size=test_set_size)

split_dataset = DatasetDict(
    {"train": train_test_split["train"], "test": train_test_split["test"]}
)

# The target repository is named after the test percentage. Use round() rather
# than int(): binary floating point makes e.g. 0.29 * 100 evaluate to
# 28.999999999999996, which int() would truncate to 28 and publish the split
# under the wrong name.
# NOTE(review): this assumes test_set_size is a fraction in (0, 1); confirm the
# config never supplies an absolute sample count here.
test_percent = round(test_set_size * 100)
split_dataset.push_to_hub(f"{source_repo_id}-{test_percent}-split")
