import os, glob
import re
import spacy

import sys

sys.path.insert(1, './identifier_normalizer')

import normalizer_main

# Root directory that receives the masked copy of the dataset.
path = 'DUTA_10K_Masked_Dataset'

try:
    os.mkdir(path)
except FileExistsError:
    # Only an existing *directory* is harmless. A regular file with this name
    # would make every later write fail, so surface that immediately. Any
    # other OSError (e.g. permission denied) is no longer reported as
    # "already created" and propagates to the caller.
    if not os.path.isdir(path):
        raise
    print("Dataset directory already created.")

# One listing file per domain; every line in a listing is the path of a text
# file that will be masked.
sites_bydomain = glob.glob('classes/*.txt')

# Codecs tried, in this order, when reading a listed file.
encodings = [
    'utf-8',
    'windows-1250',
]

# spaCy pipeline loaded once and shared by all preprocessing calls; the
# parser and NER components are excluded at load time.
nlp = spacy.load(
    'en_core_web_sm',
    exclude=["parser", "ner"],
)

# Captures the text between the first two slashes of a listed path; that text
# becomes the output file name. Compiled once since it is applied to every line.
_ONION_DIR_RE = re.compile('/(.+?)/')


def _read_text(file_path, candidates):
    """Return the text of *file_path* decoded with the first codec that works.

    Codecs are tried in the order given. ``None`` is returned when every one
    of them raises ``UnicodeDecodeError``.
    """
    for codec in candidates:
        try:
            with open(file_path, 'r', encoding=codec) as f:
                return f.read()
        except UnicodeDecodeError:
            continue
    return None


for domain in sites_bydomain:

    # Take the domain name from the listing's file name. This does not depend
    # on the path separator glob returned, and unlike an unescaped-dot regex
    # it cannot truncate names that happen to contain "txt".
    current_domain = os.path.splitext(os.path.basename(domain))[0]

    try:
        os.mkdir(os.path.join(path, current_domain))
    except FileExistsError:
        # Other OSErrors (missing parent, permissions) are real failures and
        # are allowed to propagate.
        print(f'{current_domain} domain already created.')

    print(f'Masking {current_domain} domain...')

    with open(domain, 'r') as sites:
        for onion in sites:
            onion = onion.rstrip('\n')

            found = _ONION_DIR_RE.search(onion)
            if found is None:
                # Blank lines are ignored quietly; anything else that lacks
                # the expected "/<name>/" segment is reported and skipped.
                if onion.strip():
                    print(f'Skipping unrecognised entry: {onion}')
                continue

            # NOTE(review): the output name depends only on this path segment,
            # so listed files sharing it overwrite one another.
            url = found.group(1) + '.txt'

            content = _read_text(onion, encodings)
            if content is None:
                print(f'Skipping {onion}: none of {encodings} could decode it.')
                continue

            try:
                content = normalizer_main.preprocess(content, spacy_nlp=nlp)
            except OverflowError:
                # Best effort, as before: keep the decoded text unchanged, but
                # say so instead of storing it silently.
                print(f'Could not mask {onion}; storing the original text.')

            # make dataset
            if content is not None:
                target = os.path.join(path, current_domain, url)
                # Explicit UTF-8 so text decoded above can always be written,
                # whatever the platform's default encoding is.
                with open(target, 'w', encoding='utf-8') as fw:
                    fw.write(content)

print('Masked Dataset Created')
