import matplotlib.pyplot as plt
import numpy as np
import spacy

# Classify every entry of the alphabetic word list as in-vocabulary ("valid")
# or out-of-vocabulary ("invalid") according to the spaCy model, write the
# valid entries to disk, and save a pie chart of the two proportions.

# Model whose vocabulary decides which words count as valid.
nlp = spacy.load('en_core_web_lg')

# One entry per line; trailing whitespace (including the newline) is dropped.
with open('wordlists/wordlist_alphabetic.txt') as f:
    wordlist = [line.rstrip() for line in f]

valid_words = []
invalid_words = []

for word in wordlist:
    # Tokenize only: is_oov is looked up from the vocabulary, so the
    # remaining pipeline components are not needed for this check.
    doc = nlp.make_doc(word)

    # Blank lines produce no tokens and therefore nothing to classify.
    if len(doc) == 0:
        continue

    # The tokenizer may split a single entry into several tokens. Record the
    # entry exactly once (instead of once per token, possibly in both lists):
    # it is valid only when every one of its tokens is in the vocabulary.
    if any(token.is_oov for token in doc):
        invalid_words.append(word)
    else:
        valid_words.append(word)

# Persist the valid entries, one per line.
with open('wordlists/alphabetic_valid.txt', 'w') as f:
    f.writelines(f'{word}\n' for word in valid_words)

print(f'Valid words: {len(valid_words)}')
print(f'Invalid words: {len(invalid_words)}')

# Pie chart of the valid/invalid split, labelled with percentages.
labels = 'Valid', 'Invalid'
fracs = [len(valid_words), len(invalid_words)]
plt.pie(fracs, labels=labels, autopct='%1.1f%%', shadow=True)
plt.title('Proportion of Valid and Invalid Words of CoDA')

plt.savefig('lex_figs/alphabetic_valid.png')
