from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.corpus import stopwords

import numpy as np
import os
import re

import pandas as pd

# Minimum document frequency. Only referenced by the commented-out
# CountVectorizer configuration below; the active vectorizer keeps every term.
min_freq = 3

# With input='filename' the vectorizer expects a sequence of file paths,
# one path per document, so the single dump file is wrapped in a list.
dump_path = ["wordlists/surf_worddump.txt"]

# Alternative configurations kept for reference:
#vectorizer = TfidfVectorizer(input='filename', strip_accents='ascii', min_df=2, stop_words=mask_tokens)
#vectorizer = CountVectorizer(input='filename', strip_accents='ascii', min_df=min_freq)
vectorizer = CountVectorizer(input='filename', strip_accents='ascii')
count_vectors = vectorizer.fit_transform(dump_path)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on old releases; list()
# keeps feature_arr the same type (a list of str) in both cases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_arr = list(vectorizer.get_feature_names_out())
else:
    feature_arr = list(vectorizer.get_feature_names())

# tfidf_pairs = zip(feature_arr, np.asarray(count_vectors.sum(axis=0)).ravel())

# One row per input file; exactly one file is listed in dump_path, so
# anything other than a single row means the setup above is inconsistent.
doc_count = count_vectors.shape[0]
assert doc_count == 1, f"expected exactly 1 document, got {doc_count}"

# Directory that receives the generated word-count CSV.
commonwords_path = "lexstats_surface"
try:
    os.mkdir(commonwords_path)
except FileExistsError:
    # Only an already-existing directory is fine. Other OSErrors (permission
    # denied, missing parent, ...) propagate instead of being misreported as
    # "already created", and so does a non-directory sitting at this path.
    if not os.path.isdir(commonwords_path):
        raise
    print("Common words directory already created.")

# Write each document's term counts, most frequent first, to the CSV.
for doc_index in range(doc_count):

    # Sparse row of raw term counts for this document, turned into a dense
    # column so that it lines up with the vocabulary used as the index.
    term_counts = count_vectors[doc_index]
    dense_column = term_counts.T.todense()

    counts_table = pd.DataFrame(dense_column, index=feature_arr, columns=["count"])
    ranked_table = counts_table.sort_values(by=["count"], ascending=False)

    csv_file_name = os.path.join(commonwords_path, 'most_common_validinvalid.csv')
    ranked_table.to_csv(csv_file_name)

    print(f'{csv_file_name} created.')

# Next step: to get the most common invalid words,
# run surf_common_invalids.py