"""Utils for loading datasets from hub."""

import enum
from typing import Optional, Union, List, Tuple

import pandas as pd
from datasets import load_dataset


class DatasetsAvailable(enum.Enum):
    """Datasets that can be loaded from the hub.

    Each member's name is the dataset name accepted by the loader in this
    module; its value is a short human-readable description of the data.
    """

    MultidomainGold = "Multidomain gold dataset."
    RUSpellRU = "Social media texts and blogs."
    MedSpellchecker = "Medical anamnesis."
    GitHubTypoCorpusRu = "Github commits."


# Names of all enum members, in definition order.
datasets_available = [member.name for member in DatasetsAvailable]

# TODO: remove `use_auth_token` from `load_dataset` after public release


def load_available_dataset_from_hf(
        dataset_name: str, for_labeler: bool, split: Optional[str] = None
) -> Union[Tuple[List[str], List[str]], pd.DataFrame]:
    """Load one of the available datasets from the hub as pandas data.

    Args:
        dataset_name: Name of the dataset to load; must be one of
            `datasets_available`.
        for_labeler: If True, return the `source` and `correction` columns
            as two plain lists instead of the whole frame.
        split: Split to load. If None, every split returned by
            `load_dataset` is concatenated into one frame with a fresh index.

    Returns:
        `(sources, corrections)` when `for_labeler` is True, otherwise the
        dataset as a `pd.DataFrame`.

    Raises:
        ValueError: If `dataset_name` is not in `datasets_available`.
    """
    if dataset_name not in datasets_available:
        # Join the names explicitly: unpacking the list into `format` would
        # fill the second placeholder with the first name only.
        raise ValueError("You provided wrong dataset name: {}\nAvailable datasets are: {}".format(
            dataset_name, ", ".join(datasets_available)))
    # The original name of the repository is removed to maintain anonymity.
    # To use the datasets you can still load them from the local folders;
    # the datasets themselves are located in data-archive.
    dataset = load_dataset("<REPO_NAME>", dataset_name, split=split, use_auth_token=True)
    if split is None:
        # No split requested: `dataset` is a mapping of split name -> split,
        # so stack all of them into a single frame.
        frame = pd.concat(
            [part.to_pandas() for part in dataset.values()]
        ).reset_index(drop=True)
    else:
        frame = dataset.to_pandas()
    if for_labeler:
        sources = frame["source"].values.tolist()
        corrections = frame["correction"].values.tolist()
        return sources, corrections
    return frame
