import os, glob
import re

# for plotting
import matplotlib.pyplot as plt
import numpy as np

from lexicalrichness import LexicalRichness
from textwrap import wrap

# Names of the categories that were analysed, in processing order.
domains = []

# Per-category measurements; each list receives one NumPy array per category.
word_counts, uniq_counts, ttr_ratio, mattr_ratio = [], [], [], []

# Directory that receives the two text reports written below.
path = 'lex_stats_DUTA'
try:
    os.mkdir(path)
except FileExistsError:
    # Re-running the script is expected, so an existing directory is fine.
    # Any other OSError (permissions, missing parent, ...) is no longer
    # swallowed, and a regular file squatting on the name is an error too.
    if not os.path.isdir(path):
        raise
    print('Directory already created.')

# `fw` gets one entry per analysed file, `fs` a four-line summary per
# category.  Both are opened in a single `with` statement so they are flushed
# and closed even if reading or analysing a file raises part-way through.
with open(os.path.join(path, 'lex_stats_masked_all.txt'), 'w') as fw, \
        open(os.path.join(path, 'lex_stats_masked_concise.txt'), 'w') as fs:

    fw.write('[TOTAL WORDS\tUNIQUE WORDS\tTYPE TOKEN RATIO]')
    fs.write('[TOTAL WORDS\tUNIQUE WORDS\tTYPE TOKEN RATIO]')

    # Every immediate sub-directory of the dataset root is one category.
    # os.listdir() returns entries in arbitrary order, so the listing is
    # sorted to make the reports and the plots reproducible between runs.
    datapath = 'DUTA_10K_Masked_Dataset'
    catpath = [os.path.join(datapath, cat)
               for cat in sorted(os.listdir(datapath))
               if os.path.isdir(os.path.join(datapath, cat))]

    for domainpath in catpath:

        datapaths = [os.path.join(domainpath, data)
                     for data in sorted(os.listdir(domainpath))
                     if os.path.isfile(os.path.join(domainpath, data))]

        # Categories with fewer than 300 files are left out of the analysis.
        if len(datapaths) < 300:
            print(f'Skip {domainpath}')
            continue

        # basename() works with any path separator, unlike a regex on '/'.
        current_domain = os.path.basename(domainpath)

        fw.write(f'\n-*-*-*-*-{current_domain}-*-*-*-*-\n\n')
        fs.write(f'\n-*-*-*-*-{current_domain}-*-*-*-*-\n\n')

        print(f'Parsing & analyzing {current_domain}...')

        wordcnt = []
        uniqcnt = []
        ttr = []
        mattr = []

        for onion in datapaths:

            # File name without its extension.  The category directory is not
            # repeated here because the section header already names it.
            onion_url = os.path.splitext(os.path.basename(onion))[0]

            # NOTE(review): files are read with the platform default
            # encoding -- confirm the dataset encoding and pass it explicitly.
            with open(onion, 'r') as f:
                fw.write(onion_url + '\n')
                content = f.read()

            lex = LexicalRichness(content)

            # Files without any word get a name line but no statistics.
            if lex.words <= 0:
                continue

            fw.write('[{:<8}, {:<8}, {:.3f}]\n'.format(lex.words, lex.terms, lex.ttr))

            wordcnt.append(lex.words)
            uniqcnt.append(lex.terms)
            ttr.append(lex.ttr)

            # MATTR uses a 200-word window, shrunk to the whole text when the
            # text is shorter than that.
            mattr.append(lex.mattr(window_size=min(lex.words, 200)))

        if not wordcnt:
            # np.amin/np.amax raise ValueError on an empty array, so a
            # category whose files are all empty is reported and skipped
            # instead of aborting the whole run.
            fs.write('No files with at least one word.\n')
            print(f'Skip {domainpath}')
            continue

        wordcnt_arr = np.array(wordcnt)
        uniqcnt_arr = np.array(uniqcnt)
        ttr_arr = np.array(ttr)
        mattr_arr = np.array(mattr)

        fs.write('Mean    : [{:<10.2f}, {:<8.2f}, {:.3f}]\n'.format(wordcnt_arr.mean(),
                                                                  uniqcnt_arr.mean(),
                                                                  ttr_arr.mean()))
        fs.write('Median  : [{:<10}, {:<8}, {:.3f}]\n'.format(np.median(wordcnt_arr),
                                                              np.median(uniqcnt_arr),
                                                              np.median(ttr_arr)))
        fs.write('Min     : [{:<10}, {:<8}, {:.3f}]\n'.format(np.amin(wordcnt_arr),
                                                              np.amin(uniqcnt_arr),
                                                              np.amin(ttr_arr)))
        fs.write('Max     : [{:<10}, {:<8}, {:.3f}]\n'.format(np.amax(wordcnt_arr),
                                                              np.amax(uniqcnt_arr),
                                                              np.amax(ttr_arr)))

        # The category name is recorded together with its data so that
        # `domains` always lines up with the four measurement lists.
        domains.append(current_domain)
        word_counts.append(wordcnt_arr)
        uniq_counts.append(uniqcnt_arr)
        ttr_ratio.append(ttr_arr)
        mattr_ratio.append(mattr_arr)

# boxplot for statistics

# Break each category name into lines of at most 20 characters; the wrapped
# names are used as x-axis tick labels by the plots below.
domains = ['\n'.join(wrap(name, width=20)) for name in domains]

# Directory that receives the generated figures.
fig_path = 'lexical_figs'
try:
    os.mkdir(fig_path)
except FileExistsError:
    # An existing directory is fine on re-runs.  Other OSErrors are no longer
    # hidden behind this message, and a regular file with this name is an
    # error as well.
    if not os.path.isdir(fig_path):
        raise
    print('Directory already created.')

def _category_boxplot(samples, labels, title, ylabel, *, size,
                      rotation=60, fontsize=6, showfliers=None):
    """Draw one box per category and return ``(figure, axes)``.

    Args:
        samples: Sequence with one array of values per category.
        labels: Tick labels for the x axis, one per entry of ``samples``.
        title: Axes title.
        ylabel: Label of the y axis.
        size: ``(width, height)`` of the figure in inches.
        rotation: Rotation of the tick labels in degrees.
        fontsize: Font size of the tick labels.
        showfliers: Forwarded to ``Axes.boxplot``; ``None`` keeps the
            Matplotlib default.
    """
    fig, ax = plt.subplots()
    ax.set(title=title, ylabel=ylabel)
    ax.boxplot(samples, showfliers=showfliers)
    ax.set_xticklabels(labels, rotation=rotation, fontsize=fontsize)
    fig.set_size_inches(*size)
    # Called after resizing so the layout is computed for the final size.
    fig.tight_layout()
    return fig, ax


# The two count plots hide outliers; the two ratio plots keep the default.
fig_wordcnt, wc_x = _category_boxplot(
    word_counts, domains,
    title='Word count for each DUTA category (with token masking)',
    ylabel='Word count',
    size=(42, 8),
    showfliers=False,
)

fig_uniqcnt, uq_x = _category_boxplot(
    uniq_counts, domains,
    title='Unique word count for each DUTA category (with token masking)',
    ylabel='Unique word count',
    size=(42, 8),
    showfliers=False,
)

fig_ttr, ttr_x = _category_boxplot(
    ttr_ratio, domains,
    title='TTR for each DUTA category (with token masking)',
    ylabel='Type-Token Ratio',
    size=(42, 8),
)

# The MATTR figure is narrower and uses larger, vertical tick labels.
fig_mattr, mattr_x = _category_boxplot(
    mattr_ratio, domains,
    title='MATTR for each DUTA category (with token masking)',
    ylabel='Moving Average Type-Token Ratio',
    size=(20, 8),
    rotation=90,
    fontsize=8,
)


# Save every figure as a PDF inside `fig_path`, the directory created above,
# instead of repeating the directory name in each file path.
for figure, stem in (
    (fig_wordcnt, 'wordcnt'),
    (fig_uniqcnt, 'uniqcnt'),
    (fig_ttr, 'ttr'),
    (fig_mattr, 'mattr'),
):
    figure.savefig(os.path.join(fig_path, f'{stem}_bydomain_masked.pdf'))
