#!/usr/bin/python3
# Get distribution of most frequently occurring words
# -> Only use alphabetic valid words

import os
import copy
import nltk
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
import numpy as np


# All paths are resolved relative to the directory the script is run from.
PATH_BASE = os.getcwd()

# Sub-directories: input word lists and output lexical statistics.
PATH_WORDLIST, PATH_LEX = (
    os.path.join(PATH_BASE, subdir) for subdir in ("wordlists", "lex_stats")
)

# Input files read from the word-list directory.
FILE_VALIDS, FILE_TEMPLIST = (
    os.path.join(PATH_WORDLIST, filename)
    for filename in ("alphabetic_valid_sorted.txt", "temp_lower.txt")
)

# Output files written to the statistics directory
# (full table, and table with stopwords removed).
FILE_DEST, FILE_DEST_STOP = (
    os.path.join(PATH_LEX, filename)
    for filename in ("most_frequent_words.txt", "most_frequent_words_wostop.txt")
)

# Read the whitelist of valid words: one entry per line, surrounding
# whitespace removed.
with open(FILE_VALIDS, 'r') as valid_file:
    valid_words = [line.strip() for line in valid_file]

# Start every valid word at zero so that words which never occur still
# show up in the resulting table.
occurrences = dict.fromkeys(valid_words, 0)

# Read the complete word list whose entries are to be counted.
with open(FILE_TEMPLIST, 'r') as words_file:
    all_words = [line.strip() for line in words_file]

print("checkpoint")

# Count only entries that are on the whitelist; anything else is skipped.
for token in all_words:
    if token in occurrences:
        occurrences[token] += 1

# Order by descending frequency. The sort is stable, so words with equal
# counts keep the order in which they appear in the whitelist.
ranked_words = sorted(occurrences, key=occurrences.get, reverse=True)
occurrences_sorted = {token: occurrences[token] for token in ranked_words}

# Build a second frequency table with English stopwords removed.
#
# The stopword list is bound to its own name instead of rebinding
# `stopwords`: overwriting the imported NLTK corpus reader with a list
# makes this section fail with AttributeError when it is executed a second
# time in the same session.
english_stopwords = set(stopwords.words('english'))

# A filtering comprehension keeps the frequency ordering of
# `occurrences_sorted` and avoids deep-copying the whole table only to
# delete entries from it afterwards.
occurrences_no_stopwords = {
    word: freq
    for word, freq in occurrences_sorted.items()
    if word not in english_stopwords
}

# Write the full frequency table (stopwords included) to FILE_DEST.
# The `with` block guarantees the file is closed even if a write fails.
with open(FILE_DEST, 'w') as fdest:
    fdest.write('Most common words in CoDA\n')
    fdest.write('Word : Frequency\n\n')
    # One row per word: the word left-aligned in a 20-character column,
    # followed by its count.
    fdest.writelines(
        f'{word:<20} : {freq}\n' for word, freq in occurrences_sorted.items()
    )

# Write the frequency table without stopwords to FILE_DEST_STOP.
# The `with` block guarantees the file is closed even if a write fails.
with open(FILE_DEST_STOP, 'w') as fdest:
    fdest.write('Most common words in CoDA excluding stopwords\n')
    fdest.write('Word : Frequency\n\n')
    # One row per word: the word left-aligned in a 20-character column,
    # followed by its count.
    fdest.writelines(
        f'{word:<20} : {freq}\n'
        for word, freq in occurrences_no_stopwords.items()
    )
