'''
DUTA-10K dataset builder for learning networks

Pipeline:
1. Use only sites labeled as English
2. Remove non-alphabetic words
'''

import os
import csv
import glob

# Parse the DUTA CSV into {class_name: [English onion addresses]}
def _load_english_sites(csv_path):
    """Map every class name found in the CSV to its English onion addresses.

    Keys appear in first-seen order. A class whose rows are all non-English
    is still present, mapped to an empty list.
    """
    sites = {}

    # newline='' lets the csv module handle line endings itself.
    with open(csv_path, newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')

        next(csv_reader, None)  # skip the header row (tolerates an empty file)
        for row in csv_reader:
            # Blank lines come back as [] and truncated rows lack the
            # columns indexed below; skip them instead of raising IndexError.
            if len(row) < 5:
                continue

            row = [field.strip() for field in row]

            # "<class>_<subclass>" when a subclass exists, else just "<class>".
            name_parts = row[2:4] if row[3] else row[2:3]
            class_name = '_'.join(name_parts).lower().replace(' ', '-')

            pages = sites.setdefault(class_name, [])

            # An empty address would be a substring of every path and pull
            # the whole dataset into this class, so it is ignored.
            if row[4] == 'en' and row[1]:
                pages.append(row[1])

    return sites


# Get all English data and separate into subgroups
def separate_data():
    """Write one listing file per DUTA-10K class into the ``classes`` directory.

    Reads ``DUTA_10K.csv`` from the current working directory. Rows whose
    language column (index 4) equals ``'en'`` contribute their onion address
    (index 1) to the class named by columns 2 and 3. For every class seen in
    the CSV, ``classes/<class>.txt`` is (re)written with one line per path
    matched by ``DUTA_10K_Dataset/*/*`` that ends in ``txt`` and contains one
    of that class's addresses as a substring. Paths are written in sorted
    order so repeated runs produce identical files.

    Returns:
        None. Progress messages are printed to stdout.

    Raises:
        OSError: if the CSV cannot be opened or an output file cannot be
            written.
    """
    path = "classes"

    # os.mkdir raises OSError on failure (most commonly because the directory
    # already exists); the `else` branch is the success path.
    try:
        os.mkdir(path)
    except OSError:
        print("directory creation fail - check if it already exists")
    else:
        print("classes directory created.")

    # Dictionary of subclasses (classes if subclasses do not exist)
    sites_dict = _load_english_sites('DUTA_10K.csv')

    # Text files whose paths embed the corresponding onion urls
    data = glob.glob('DUTA_10K_Dataset/*/*')
    txt_data = sorted(item for item in data if item.lower().endswith('txt'))

    for domain, pages in sites_dict.items():
        fname = os.path.join(path, domain + '.txt')
        print(f'Creating text file for \'{domain}\' domain...')

        with open(fname, 'w') as f:
            # any() emits each path at most once, even when several
            # addresses of the class match it.
            f.writelines(
                f'{item}\n'
                for item in txt_data
                if any(page in item for page in pages)
            )

# Script entry point: build the per-class listing files, then report completion.
if __name__ == '__main__':
    separate_data()
    print("Dataset created.")
