import os, glob
import re

# for plotting
import matplotlib.pyplot as plt
import numpy as np

from lexicalrichness import LexicalRichness


def domain_name_from_listing(listing_path):
    """Return the category name for a listing file: its base name without extension.

    Uses os.path instead of a regex so that names containing 'txt' and
    platform-specific path separators are handled correctly.
    """
    return os.path.splitext(os.path.basename(listing_path))[0]


def onion_label(page_path):
    """Return the text between the first two '/' of *page_path*.

    Falls back to the whole path when it has fewer than two slashes, instead
    of failing on a missing regex match.
    """
    match = re.search(r'/(.+?)/', page_path)
    return match.group(1) if match else page_path


def read_page_text(page_path, candidate_encodings):
    """Return the text of *page_path* decoded with the first encoding that works.

    Returns None when no candidate encoding can decode the file, or when the
    file cannot be opened/read at all (a warning is printed in that case).
    """
    for enc in candidate_encodings:
        try:
            with open(page_path, 'r', encoding=enc) as f:
                return f.read()
        except UnicodeDecodeError:
            # wrong guess, try the next candidate encoding
            continue
        except OSError as err:
            # a different encoding will not help with an unreadable file
            print(f'Skipping {page_path}: {err}')
            return None
    return None


def summary_lines(words, uniques, ratios):
    """Return the Mean/Median/Min/Max report lines for one category.

    Each argument is a sequence (list or numpy array) with one value per valid
    page. An empty category yields a single placeholder line, because the
    numpy min/max reductions raise ValueError on zero-size input.
    """
    # explicit length test: the truth value of a numpy array is ambiguous
    if len(words) == 0:
        return ['No valid pages\n']

    return [
        'Mean   : [{:<10.2f}, {:<8.2f}, {:.3f}]\n'.format(
            np.mean(words), np.mean(uniques), np.mean(ratios)),
        'Median : [{:<10}, {:<8}, {:.3f}]\n'.format(
            np.median(words), np.median(uniques), np.median(ratios)),
        'Min    : [{:<10}, {:<8}, {:.3f}]\n'.format(
            np.amin(words), np.amin(uniques), np.amin(ratios)),
        'Max    : [{:<10}, {:<8}, {:.3f}]\n'.format(
            np.amax(words), np.amax(uniques), np.amax(ratios)),
    ]


# one listing file per category; sorted so the report order is reproducible
sites_bydomain = sorted(glob.glob('classes/*.txt'))

# encodings tried in order when reading a page
encodings = ['utf-8', 'windows-1250']

# category names, in the same order as the measurement lists below
domains = []

# list of measurements (one numpy array per category)
word_counts = []
uniq_counts = []
ttr_ratio = []

path = 'lex_stats_DUTA'
try:
    os.mkdir(path)
except FileExistsError:
    # only an existing directory is expected; other OS errors propagate
    print('Directory already created')

REPORT_HEADER = '[TOTAL WORDS\tUNIQUE WORDS\tTYPE TOKEN RATIO]'

# fw: per-page report, fs: per-category summary.
# The context manager closes both files even if processing fails midway.
with open(os.path.join(path, 'lex_stats_all.txt'), 'w') as fw, \
        open(os.path.join(path, 'lex_stats_concise.txt'), 'w') as fs:

    fw.write(REPORT_HEADER)
    fs.write(REPORT_HEADER)

    for domain in sites_bydomain:

        current_domain = domain_name_from_listing(domain)
        domains.append(current_domain)

        banner = f'\n-*-*-*-*-{current_domain}-*-*-*-*-\n\n'
        fw.write(banner)
        fs.write(banner)

        # per-page measurements for the current category
        wordcnt = []
        uniqcnt = []
        ttr = []

        # each non-empty line of the listing is the path of one page file
        with open(domain, 'r') as sites:
            for onion in sites:
                onion = onion.rstrip('\n')
                if not onion:
                    continue

                fw.write(onion_label(onion) + '\n')

                content = read_page_text(onion, encodings)
                if content is None:
                    continue

                lex = LexicalRichness(content)

                # pages without any words are not counted
                if lex.words > 0:
                    fw.write('[{:<8}, {:<8}, {:.3f}]\n'.format(lex.words, lex.terms, lex.ttr))

                    wordcnt.append(lex.words)
                    uniqcnt.append(lex.terms)
                    ttr.append(lex.ttr)

        wordcnt_arr = np.array(wordcnt)
        uniqcnt_arr = np.array(uniqcnt)
        ttr_arr = np.array(ttr)

        fs.writelines(summary_lines(wordcnt_arr, uniqcnt_arr, ttr_arr))

        # appended even when empty so the lists stay aligned with `domains`
        word_counts.append(wordcnt_arr)
        uniq_counts.append(uniqcnt_arr)
        ttr_ratio.append(ttr_arr)

# boxplot for statistics

fig_path = 'lexical_figs'
try:
    os.mkdir(fig_path)
except FileExistsError:
    # only an existing directory is expected; other OS errors propagate
    print('Directory already created.')


def category_boxplot(data, labels, title, ylabel, showfliers=None):
    """Draw one box per category and return the (figure, axes) pair.

    data       -- sequence with one array of measurements per category
    labels     -- x tick labels, one per entry of *data*
    title      -- axes title
    ylabel     -- y axis label
    showfliers -- forwarded to Axes.boxplot; None keeps matplotlib's default
    """
    fig, ax = plt.subplots()
    ax.set(title=title, ylabel=ylabel)
    ax.boxplot(data, showfliers=showfliers)
    ax.set_xticklabels(labels, rotation=60, fontsize=6)
    # very wide canvas for the rotated category labels
    fig.set_size_inches(42, 8)
    fig.tight_layout()
    return fig, ax


fig_wordcnt, wc_x = category_boxplot(
    word_counts, domains,
    title='Word count for each DUTA category',
    ylabel='Word count',
    showfliers=False,
)

fig_uniqcnt, uq_x = category_boxplot(
    uniq_counts, domains,
    title='Unique word count for each DUTA category',
    ylabel='Unique word count',
    showfliers=False,
)

# the TTR plot keeps matplotlib's default outlier display
fig_ttr, ttr_x = category_boxplot(
    ttr_ratio, domains,
    title='TTR for each DUTA category',
    ylabel='Type-Token Ratio',
)

# save into the directory created above instead of repeating its name
fig_wordcnt.savefig(os.path.join(fig_path, 'wordcnt_bydomain.png'), dpi=160)
fig_uniqcnt.savefig(os.path.join(fig_path, 'uniqcnt_bydomain.png'), dpi=160)
fig_ttr.savefig(os.path.join(fig_path, 'ttr_bydomain.png'), dpi=160)
