""" utility modules related to dataset """
import re
import pandas as pd

# Dataset names accepted by `get_data` (values of the CSV's `dataset` column).
VALID_DATA_TYPE = ['news', 'safeguard', 'comments']
__all__ = ('get_data', 'VALID_DATA_TYPE')

# Characters that are deleted outright (NOT replaced by a space) when cleaning
# 'news' text; e.g. "a\nb" becomes "ab".
_NEWS_DELETE_TABLE = str.maketrans('', '', '\n-|>:')
_WHITESPACE_RUN = re.compile(r'\s+')


def _quick_cleaning(sentence: str) -> str:
    """ light normalisation applied to 'news' texts

    Deletes newline, '-', '|', '>' and ':' characters, collapses every run of
    whitespace into a single space, trims both ends and lower-cases the result.

    :param sentence: raw text
    :return: cleaned text
    """
    sentence = sentence.translate(_NEWS_DELETE_TABLE)
    sentence = _WHITESPACE_RUN.sub(' ', sentence)
    # After collapsing, any leading/trailing whitespace is exactly one ' '.
    return sentence.strip(' ').lower()


def get_data(data_type: str = 'news', data_dir: str = './data'):
    """ to get a dataset given a dataset type

    Reads `<data_dir>/trainingDataSubsets.csv` and keeps the rows whose
    `dataset` column equals `data_type`. Texts of the 'news' dataset are
    additionally passed through a quick cleaning step; other datasets are
    returned as stored in the `clData` column.

    :param data_type: one from VALID_DATA_TYPE
    :param data_dir: directory that contains `trainingDataSubsets.csv`
    :return: sentences (a list of string texts), meta_dict (a dict of lists
        aligned with `sentences`, with keys 'overallLabel', 'id', 'data',
        'dataset' and 'clData'; 'clData' is the same list as `sentences`)
    :raises AssertionError: if `data_type` is not in VALID_DATA_TYPE
    """
    # Raise explicitly instead of using `assert`: a bare assert is stripped
    # under `python -O`, which would silently yield empty results. The
    # exception type is kept for backward compatibility with existing callers.
    if data_type not in VALID_DATA_TYPE:
        raise AssertionError(
            'invalid data_type {!r}: expected one of {}'.format(data_type, VALID_DATA_TYPE))

    df = pd.read_csv('{}/trainingDataSubsets.csv'.format(data_dir), index_col=0)

    # Select the rows of the requested dataset once and reuse the selection.
    subset = df[df['dataset'] == data_type]

    sentences = subset['clData'].values.tolist()
    if data_type == 'news':
        sentences = [_quick_cleaning(s) for s in sentences]

    # other meta data
    meta_dict = {
        column: subset[column].values.tolist()
        for column in ('overallLabel', 'id', 'data', 'dataset')
    }
    meta_dict['clData'] = sentences

    return sentences, meta_dict
