#!/usr/bin/python3
# Make one directory per category and symlink each file into the corresponding directory
# -> Only use sites labeled as English

import os

def link_files_by_category(path_text: str, path_category: str,
                           lang: str = "en") -> int:
    """Sort corpus files into per-category directories via symlinks.

    File names in *path_text* are expected to have the form
    ``num-category-lang-doc_id``. A sub-directory of *path_category* is
    created for every category encountered (regardless of language), and
    every file whose language field equals *lang* is symlinked into the
    directory of its category.

    The function is safe to run repeatedly: existing directories are
    reused and symlinks that already point at the right source are kept.

    Args:
        path_text: Directory containing the source text files.
        path_category: Directory in which the category directories are
            created. Missing parent directories are created as well.
        lang: Language code a file must carry to be linked.

    Returns:
        The number of symlinks newly created by this call.

    Raises:
        OSError: If a directory or symlink cannot be created for a reason
            other than "it already exists as expected" (e.g. permission
            denied, or a different file occupying the destination).
    """
    # exist_ok only tolerates an existing *directory*; real failures such
    # as permission errors are raised instead of being reported as
    # "already exists".
    os.makedirs(path_category, exist_ok=True)

    seen_categories = set()
    links_created = 0

    for txtfile in os.listdir(path_text):
        fields = txtfile.split('-')
        if len(fields) != 4:
            # Stray files (hidden files, READMEs, ...) must not abort the run.
            print(f"Skipping {txtfile!r}: name is not 'num-category-lang-doc_id'")
            continue
        _num, category, file_lang, _doc_id = fields

        category_dir = os.path.join(path_category, category)
        if category not in seen_categories:
            seen_categories.add(category)
            os.makedirs(category_dir, exist_ok=True)

        if file_lang != lang:
            continue

        src_name = os.path.join(path_text, txtfile)
        dst_name = os.path.join(category_dir, txtfile)
        try:
            os.symlink(src_name, dst_name)
        except FileExistsError:
            # A link left over from a previous run is fine as long as it
            # points at the same source; anything else is a real conflict.
            if os.path.islink(dst_name) and os.readlink(dst_name) == src_name:
                continue
            raise
        links_created += 1

    return links_created


def main() -> None:
    """Categorize the v4.1 corpus located next to the current directory."""
    # Paths are resolved relative to the parent of the working directory,
    # i.e. the script is expected to be started from its own sub-directory.
    os.chdir('..')

    path_base = os.getcwd()
    path_analysis = os.path.join(path_base, "corpus_analysis")
    path_category = os.path.join(path_analysis, "categories_nonprocessed")
    path_text = os.path.join(path_base,
                             "category_annotated_corpus_v4.1_20210513",
                             "txt")

    link_files_by_category(path_text, path_category)
    print("Categorization complete.")


if __name__ == "__main__":
    main()
