import os

import matplotlib.pyplot as plt
import numpy as np
from spellchecker import SpellChecker

# Spell checker used to classify every entry of the surface-web word list.
spell = SpellChecker()

# Read one candidate word per line. The encoding is pinned so decoding does
# not depend on the platform's locale default, and lines that are empty after
# stripping trailing whitespace are dropped so an empty string is never handed
# to the spell checker as if it were a word.
with open('wordlists/wordlist_surf.txt', encoding='utf-8') as f:
    wordlist = [word for word in (line.rstrip() for line in f) if word]

# Split the entries into those the checker recognises and those it does not.
valid_words = spell.known(wordlist)
invalid_words = spell.unknown(wordlist)

# Persist each group to its own file, one word per line. The words are sorted
# so repeated runs produce identical files regardless of the iteration order
# of the collections being written, and the encoding is pinned so non-ASCII
# words are written the same way on every platform.
for path, words in (
    ('wordlists/surf_valid.txt', valid_words),
    ('wordlists/surf_invalid.txt', invalid_words),
):
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(f'{word}\n' for word in sorted(words))

# Report how many entries ended up in each group.
n_valid = len(valid_words)
n_invalid = len(invalid_words)
print(f'Valid words: {n_valid}')
print(f'Invalid words: {n_invalid}')

labels = 'Valid', 'Invalid'
fracs = [n_valid, n_invalid]

if sum(fracs) > 0:
    # Pie chart of the valid/invalid split, annotated with percentages.
    plt.pie(fracs, labels=labels, autopct='%1.1f%%', shadow=True)
    plt.title('Proportion of Valid and Invalid Words of Surface Web Datasets')

    # Make sure the output directory exists before saving into it, so the
    # script does not fail at the very last step on a fresh checkout.
    os.makedirs('CoDA_figs', exist_ok=True)
    plt.savefig('CoDA_figs/surf_valid.png')
else:
    # With both counts at zero the fractions cannot be normalised, so there
    # is no meaningful chart to draw.
    print('No words were classified; skipping the pie chart.')