#!/usr/bin/python3
# Get the distribution of masked content in the dataset
# -> Only use sites labeled as English

import os
import re
import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Move to the parent of the current working directory; every path below is
# derived from that location, so the script depends on where it is launched.
os.chdir('..')

PATH_BASE = os.getcwd()
# Root of the analysis tree.
PATH_ANALYSIS = os.path.join(PATH_BASE, "corpus_analysis")
# Input: its entries are listed to obtain the category names, and each entry
# is in turn listed to obtain that category's documents.
PATH_CATEGORY = os.path.join(PATH_ANALYSIS, "categories")
# Output: the two text reports are written here.
PATH_LEX = os.path.join(PATH_ANALYSIS, "lex_stats")
# Only referenced by the commented-out plotting code in the visible script.
PATH_FIGURE = os.path.join(PATH_ANALYSIS, "lex_figs")

###################
# Output: one "<category>_mask.csv" file per category is written here.
PATH_CSV = os.path.join(PATH_LEX, "mask_dist")
###################
###################

# Create the output directories (PATH_CSV lives inside PATH_LEX, so the order
# matters). Only the "already exists" case is tolerated: any other failure
# (missing parent directory, permission denied, ...) is raised here instead of
# being reported as "Directory already exists" and resurfacing later as a
# confusing error when the report files are opened.
for output_dir in (PATH_LEX, PATH_CSV):
    try:
        os.mkdir(output_dir)
    except FileExistsError:
        print("Directory already exists")

# list of measurements

# Placeholder tokens whose occurrences are counted in every document.
# The order of this list is the order used in the generated reports.
identifier_tokens = [
    'ID_EMAIL',
    'ID_ONION_URL',
    'ID_NORMAL_URL',
    'ID_IP_ADDRESS',
    'ID_BTC_ADDRESS',
    'ID_ETH_ADDRESS',
    'ID_LTC_ADDRESS',
    'ID_CRYPTO_MONEY',
    'ID_GENERAL_MONEY',
    'ID_LENGTH',
    'ID_WEIGHT',
    'ID_VOLUME',
    'ID_PERCENTAGE',
    'ID_VERSION',
    'ID_FILENAME',
    'ID_FILESIZE',
    'ID_TIME',
]


# Report files: `fw` receives the per-file counts, `fs` the per-category
# ratios. Both handles stay open for the rest of the script and are closed
# at the end of it.
fw = open(os.path.join(PATH_LEX, 'mask_dist_all.txt'), 'w')
fs = open(os.path.join(PATH_LEX, 'mask_dist_concise.txt'), 'w')

# Every sub-directory of PATH_CATEGORY is one category. Stray non-directory
# entries are skipped because each category is later passed to os.listdir(),
# which would fail on a plain file. The names are sorted because os.listdir()
# returns entries in arbitrary order, which would make the report order vary
# between runs.
categories = sorted(
    entry for entry in os.listdir(PATH_CATEGORY)
    if os.path.isdir(os.path.join(PATH_CATEGORY, entry))
)

# category name -> {identifier token -> share of that category's masked tokens}
token_stats = dict()

# For every category: count each masked token in each file (logged to the
# detailed report `fw`), then store each token's share of all masked tokens
# found in that category in `token_stats`.
for category in categories:

    PATH_CUR_CATEGORY = os.path.join(PATH_CATEGORY, category)

    fw.write(f'\n-*-*-*-*-{category}-*-*-*-*-\n\n')

    current_category_files = os.listdir(PATH_CUR_CATEGORY)
    num_files = len(current_category_files)

    # Running totals for this category, one entry per identifier token.
    token_cnt = {k: 0 for k in identifier_tokens}

    for filename in current_category_files:

        filepath = os.path.join(PATH_CUR_CATEGORY, filename)

        # Keep the input file open only for as long as it takes to read it.
        with open(filepath, 'r') as f:
            content = f.read()

        fw.write(f'{filename}:\n')

        for identifier in identifier_tokens:
            # The tokens are plain literals, so a substring count gives the
            # same result as a regex search.
            occurrences = content.count(identifier)
            token_cnt[identifier] += occurrences

            fw.write(f'{identifier:<17}: {occurrences}\n')

        fw.write('\n')

    if num_files and any(token_cnt.values()):
        # Average count per file, then each token's share of the total.
        token_cnt_norm = {k: v / num_files for k, v in token_cnt.items()}
        token_sum = sum(token_cnt_norm.values())
        token_cnt_ratio = {k: v / token_sum for k, v in token_cnt_norm.items()}
    else:
        # An empty category, or one without any masked token, would divide
        # by zero above; report every share as 0.0 instead of crashing.
        token_cnt_ratio = {k: 0.0 for k in token_cnt}

    token_stats[category] = token_cnt_ratio

############### EXPORT TO CSV #################
# Write one CSV per category: a row per identifier token holding its ratio.
for category in categories:

    ratios_frame = pd.DataFrame.from_dict(token_stats[category], orient='index')

    out_path = os.path.join(PATH_CSV, f"{category}_mask.csv")
    ratios_frame.to_csv(out_path)

    print(f'{out_path} created.')


# Concise report: a header per category followed by one
# "<token>: <ratio>" line per identifier token (3 decimal places).
for category_name, ratios in token_stats.items():

    section = [f'\n-*-*-*-*-{category_name}-*-*-*-*-\n\n']
    section.extend(f'{token:<17}: {ratio:.3f}\n' for token, ratio in ratios.items())
    section.append('\n')

    fs.writelines(section)

# pp = pprint.PrettyPrinter(indent=4)
# pp.pprint(token_stats)


    # fs.write('Mean   : [{:<10.2f}, {:<8.2f}, {:.3f}]\n'.format(wordcnt_arr.mean(), \
    #                                                            uniqcnt_arr.mean(), \
    #                                                            ttr_arr.mean()))
    # fs.write('Median : [{:<10}, {:<8}, {:.3f}]\n'.format(np.median(wordcnt_arr), \
    #                                                      np.median(uniqcnt_arr), \
    #                                                      np.median(ttr_arr)))
    # fs.write('Min    : [{:<10}, {:<8}, {:.3f}]\n'.format(np.amin(wordcnt_arr), \
    #                                                      np.amin(uniqcnt_arr), \
    #                                                      np.amin(ttr_arr)))
    # fs.write('Max    : [{:<10}, {:<8}, {:.3f}]\n'.format(np.amax(wordcnt_arr), \
    #                                                      np.amax(uniqcnt_arr), \
    #                                                      np.amax(ttr_arr)))

# Release both report files (detailed first, then concise).
for report_handle in (fw, fs):
    report_handle.close()

# boxplot for statistics

# try:
#     os.mkdir(PATH_FIGURE)
# except OSError:
#     print('Directory already created')

# fig_wordcnt, wc_x = plt.subplots()
# wc_x.set(
#     title='Word count for each CoDA category',
#     ylabel='Word count'
# )
# wc_x.boxplot(word_counts, showfliers=False)
# wc_x.set_xticklabels(categories, rotation=60, fontsize=6)
# fig_wordcnt.set_size_inches(42,8)
# fig_wordcnt.tight_layout()

# fig_uniqcnt, uq_x = plt.subplots()
# uq_x.set(
#     title='Unique word count for each CoDA category',
#     ylabel='Unique word count'
# )
# uq_x.boxplot(uniq_counts, showfliers=False)
# uq_x.set_xticklabels(categories, rotation=60, fontsize=6)
# fig_uniqcnt.set_size_inches(42,8)
# fig_uniqcnt.tight_layout()

# fig_ttr, ttr_x = plt.subplots()
# ttr_x.set(
#     title='TTR for each CoDA category',
#     ylabel='Type-Token Ratio'
# )
# ttr_x.boxplot(ttr_ratio)
# ttr_x.set_xticklabels(categories, rotation=60, fontsize=6)
# fig_ttr.set_size_inches(42,8)
# fig_ttr.tight_layout()

# fig_wordcnt.savefig(os.path.join(PATH_FIGURE, 'wordcnt_bydomain.png'), dpi=160)
# fig_uniqcnt.savefig(os.path.join(PATH_FIGURE, 'uniqcnt_bydomain.png'), dpi=160)
# fig_ttr.savefig(os.path.join(PATH_FIGURE, 'ttr_bydomain.png'), dpi=160)

