#!/usr/bin/python3
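# Compute lexical richness statistics (word count, unique word count, TTR and
# MATTR) for the documents of every category under corpus_analysis/categories,
# write them to corpus_analysis/lex_stats and draw per-category boxplots into
# corpus_analysis/lex_figs.
# -> Only sites labeled as English are expected in the category directories
#    (presumably filtered by an earlier step; confirm).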

import os

import matplotlib.pyplot as plt
import numpy as np

from lexicalrichness import LexicalRichness

# Step up one level from the current working directory; every path used by
# the script is anchored at that location.
os.chdir(os.pardir)

PATH_BASE = os.getcwd()

# Root of the analysis tree and the sub-directories used below:
# categories (input), lex_stats (text reports), lex_figs (boxplots).
PATH_ANALYSIS = os.path.join(PATH_BASE, "corpus_analysis")
PATH_CATEGORY = os.path.join(PATH_BASE, "corpus_analysis", "categories")
PATH_LEX = os.path.join(PATH_BASE, "corpus_analysis", "lex_stats")
PATH_FIGURE = os.path.join(PATH_BASE, "corpus_analysis", "lex_figs")

# Create the output directory for the text reports. Only the "already
# exists" case is tolerated; any other failure (missing parent directory,
# permission denied, ...) propagates instead of being misreported.
try:
    os.mkdir(PATH_LEX)
except FileExistsError:
    print("Directory already exists")

# Per-category measurements. Entry i of each list is the array collected for
# categories[i]; the lists are consumed later when the boxplots are drawn.
word_counts = []
uniq_counts = []
ttr_ratio = []
mattr_ratio = []

# Largest MATTR window; documents with fewer words than this use their own
# word count as the window instead.
MATTR_WINDOW = 200

# Treat only sub-directories as categories (a stray file would make the
# os.listdir call on it fail) and sort them so the reports are reproducible.
categories = sorted(
    entry for entry in os.listdir(PATH_CATEGORY)
    if os.path.isdir(os.path.join(PATH_CATEGORY, entry))
)

# fw receives one line per document, fs only the per-category summary. The
# with-statement closes both files even if a document fails to process.
with open(os.path.join(PATH_LEX, 'lex_stats_all.txt'), 'w') as fw, \
        open(os.path.join(PATH_LEX, 'lex_stats_concise.txt'), 'w') as fs:

    fw.write('[TOTAL WORDS\tUNIQUE WORDS\tTYPE TOKEN RATIO]')
    fs.write('[TOTAL WORDS\tUNIQUE WORDS\tTYPE TOKEN RATIO]')

    for category in categories:

        PATH_CUR_CATEGORY = os.path.join(PATH_CATEGORY, category)

        fw.write(f'\n-*-*-*-*-{category}-*-*-*-*-\n\n')
        fs.write(f'\n-*-*-*-*-{category}-*-*-*-*-\n\n')

        wordcnt = []
        uniqcnt = []
        ttr = []
        mattr = []

        for filename in os.listdir(PATH_CUR_CATEGORY):

            filepath = os.path.join(PATH_CUR_CATEGORY, filename)

            # Nested directories cannot be read as documents
            if not os.path.isfile(filepath):
                continue

            with open(filepath, 'r') as f:
                content = f.read()

            lex = LexicalRichness(content)

            # Documents without any words are left out of all statistics
            if lex.words <= 0:
                continue

            fw.write('[{:<8}, {:<8}, {:.3f}]\n'.format(lex.words, lex.terms, lex.ttr))

            wordcnt.append(lex.words)
            uniqcnt.append(lex.terms)
            ttr.append(lex.ttr)
            mattr.append(lex.mattr(window_size=min(MATTR_WINDOW, lex.words)))

        wordcnt_arr = np.array(wordcnt)
        uniqcnt_arr = np.array(uniqcnt)
        ttr_arr = np.array(ttr)
        mattr_arr = np.array(mattr)

        if wordcnt_arr.size == 0:
            # np.amin / np.amax raise ValueError on empty input, so a category
            # without usable documents gets a note instead of a summary.
            fs.write('No non-empty documents in this category\n')
        else:
            fs.write('Mean   : [{:<10.2f}, {:<8.2f}, {:.3f}]\n'.format(
                wordcnt_arr.mean(), uniqcnt_arr.mean(), ttr_arr.mean()))
            fs.write('Median : [{:<10}, {:<8}, {:.3f}]\n'.format(
                np.median(wordcnt_arr), np.median(uniqcnt_arr), np.median(ttr_arr)))
            fs.write('Min    : [{:<10}, {:<8}, {:.3f}]\n'.format(
                np.amin(wordcnt_arr), np.amin(uniqcnt_arr), np.amin(ttr_arr)))
            fs.write('Max    : [{:<10}, {:<8}, {:.3f}]\n'.format(
                np.amax(wordcnt_arr), np.amax(uniqcnt_arr), np.amax(ttr_arr)))

        # Always append (even when empty) so the lists stay aligned with
        # `categories`, which supplies the tick labels of the boxplots.
        word_counts.append(wordcnt_arr)
        uniq_counts.append(uniqcnt_arr)
        ttr_ratio.append(ttr_arr)
        mattr_ratio.append(mattr_arr)

# boxplot for statistics

# Create the output directory for the figures. Only the "already exists"
# case is tolerated; any other failure (missing parent directory, permission
# denied, ...) propagates instead of being misreported.
try:
    os.mkdir(PATH_FIGURE)
except FileExistsError:
    print('Directory already created')

def _category_boxplot(samples, title, ylabel, width, tick_rotation,
                      tick_fontsize, **boxplot_options):
    """Draw one boxplot per category and return the (figure, axes) pair.

    `samples` holds one array per entry of the module-level `categories`
    list, which also provides the x tick labels. The figure is `width`
    inches wide and 8 inches high; `boxplot_options` is forwarded unchanged
    to `Axes.boxplot`.
    """
    figure, axes = plt.subplots()
    axes.set(title=title, ylabel=ylabel)
    axes.boxplot(samples, **boxplot_options)
    axes.set_xticklabels(categories, rotation=tick_rotation,
                         fontsize=tick_fontsize)
    figure.set_size_inches(width, 8)
    figure.tight_layout()
    return figure, axes


# Raw counts are drawn without outlier markers; the two ratio plots keep
# matplotlib's default flier setting.
fig_wordcnt, wc_x = _category_boxplot(
    word_counts,
    'Word count for each CoDA category',
    'Word count',
    42, 60, 6,
    showfliers=False,
)

fig_uniqcnt, uq_x = _category_boxplot(
    uniq_counts,
    'Unique word count for each CoDA category',
    'Unique word count',
    42, 60, 6,
    showfliers=False,
)

fig_ttr, ttr_x = _category_boxplot(
    ttr_ratio,
    'TTR for each CoDA category',
    'Type-Token Ratio',
    42, 60, 6,
)

fig_mattr, mattr_x = _category_boxplot(
    mattr_ratio,
    'MATTR for each CoDA category',
    'Moving Average Type-Token Ratio',
    12, 90, 8,
)

# Save every figure as a PDF in the figure directory
for figure, pdf_name in (
    (fig_wordcnt, 'wordcnt_bydomain.pdf'),
    (fig_uniqcnt, 'uniqcnt_bydomain.pdf'),
    (fig_ttr, 'ttr_bydomain.pdf'),
    (fig_mattr, 'mattr_bydomain.pdf'),
):
    figure.savefig(os.path.join(PATH_FIGURE, pdf_name))

