import os
from re import S
import numpy as np
from os import listdir
import torch
from torch.utils.data import Dataset

def load_the_dataset(args, tokenizer, filter_lang, mode, logger):
    """Build one ``PrepareDataset`` per selected language directory.

    Every entry of ``args.input_dir`` whose lower-cased name is found in
    ``filter_lang`` is loaded from ``<input_dir>/<entry>/test.source`` and
    ``<input_dir>/<entry>/test.target``.

    Args:
        args: Object exposing ``input_dir``; it is also forwarded unchanged
            to ``PrepareDataset``.
        tokenizer: Forwarded unchanged to ``PrepareDataset``.
        filter_lang: Anything supporting ``in``; checked against the
            lower-cased entry name (presumably a list/set of lower-case
            language names -- verify against the caller).
        mode: Forwarded unchanged to ``PrepareDataset``. Note that the file
            names are always ``test.source`` / ``test.target`` regardless
            of this value.
        logger: Logger-like object; only ``info`` is called on it.

    Returns:
        A dict mapping each lower-cased language name to its
        ``PrepareDataset`` instance, or ``None`` when two entries collapse
        to the same lower-cased name.
    """
    language_data_pair = {}
    # os.listdir() yields entries in arbitrary order; sorting keeps the
    # result order and the duplicate handling reproducible across runs.
    for entry in sorted(listdir(args.input_dir)):
        language = entry.lower()
        if language not in filter_lang:
            continue
        if language in language_data_pair:
            # Detect the collision before paying for another dataset build.
            logger.info("Duplicated Language found")
            return None
        # Build the path from the on-disk name: the lower-cased form may not
        # exist on a case-sensitive filesystem.
        language_dir = os.path.join(args.input_dir, entry)
        files = [
            os.path.join(language_dir, 'test.source'),
            os.path.join(language_dir, 'test.target'),
        ]
        prepared_data = PrepareDataset(args, files, tokenizer, mode)
        logger.info(
            "Processed %s language and Size is: %d",
            language,
            len(prepared_data),
        )
        language_data_pair[language] = prepared_data
    return language_data_pair

