from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

import nltk
from nltk.corpus import stopwords

import numpy as np
import os
import re

import pandas as pd

# Minimum document frequency for a term to be kept; handed to the vectorizer
# as ``min_df``.
min_freq = 3

# Directory that holds the per-domain input texts, named "<domain>_en_all.txt".
path = "categories_onetext"

# The domain name is extracted from the bare file name rather than the joined
# path, so the result does not depend on the platform's path separator.
_domain_pattern = re.compile(r'(.+?)_en_all\.txt')

# ``textfiles`` and ``domains`` are built in lockstep so that index i of one
# always describes index i of the other.  The listing is sorted because
# os.listdir() returns entries in arbitrary order.
textfiles = []
domains = []
for fname in sorted(os.listdir(path)):
    match = _domain_pattern.match(fname)
    if match is None:
        # Not one of the expected input files; skip it instead of crashing
        # on ``None.group(1)``.
        print(f'Skipping {fname}: name does not match <domain>_en_all.txt')
        continue
    textfiles.append(os.path.join(path, fname))
    domains.append(match.group(1))
# print(domains)

# masked tokens should be used as stop words
# NOTE(review): mask_tokens is currently NOT passed to the active vectorizer
# below (only the commented-out TfidfVectorizer line used it).  Confirm whether
# it should be appended to the stop-word list.
mask_tokens = ['id_email', 'id_onion_url', 'id_normal_url', 'id_ip_address',
               'id_btc_address', 'id_eth_address', 'id_ltc_address',
               'id_crypto_money', 'id_general_money', 'id_length', 'id_weight',
               'id_volume', 'id_percentage', 'id_version', 'id_filename',
               'id_filesize', 'id_time', 'id_brand_name', 'id_number']

# Bound to its own name so the imported ``stopwords`` corpus reader is not
# shadowed (rebinding it made this statement fail when executed a second time).
english_stopwords = stopwords.words('english')

#vectorizer = TfidfVectorizer(input='filename', strip_accents='ascii', min_df=2, stop_words=mask_tokens)
# Raw term counts per input file; terms found in fewer than ``min_freq``
# documents are dropped from the vocabulary.
vectorizer = CountVectorizer(
    input='filename',
    strip_accents='ascii',
    min_df=min_freq,
    stop_words=english_stopwords,
)
count_vectors = vectorizer.fit_transform(textfiles)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back only on old releases.  list() keeps the original list type.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_arr = list(vectorizer.get_feature_names_out())
else:
    feature_arr = vectorizer.get_feature_names()

# tfidf_pairs = zip(feature_arr, np.asarray(count_vectors.sum(axis=0)).ravel())

# get vectors
# One row of count_vectors per input file.
doc_count = count_vectors.shape[0]

# Output directory for the per-domain CSV files.
commonwords_path = "lex_stats/common_words"
try:
    # makedirs also creates the missing parent ("lex_stats"); plain mkdir
    # failed there and the failure was misreported as "already created".
    os.makedirs(commonwords_path)
except FileExistsError:
    if not os.path.isdir(commonwords_path):
        # Something that is not a directory occupies the path; the CSV
        # writes would fail later, so surface the problem here.
        raise
    print("Common words directory already created.")

# Write one CSV per domain listing every vocabulary term with its count in
# that domain's document, highest count first.  Row i of count_vectors
# belongs to domains[i] because both follow the order of textfiles.
for row_idx, domain_name in enumerate(domains):
    csv_file_name = os.path.join(commonwords_path, domain_name + '.csv')

    # toarray() gives a plain ndarray; todense() would return the
    # discouraged np.matrix type.
    term_counts = count_vectors[row_idx].toarray().ravel()

    df = pd.DataFrame({"count": term_counts}, index=feature_arr)
    df = df.sort_values(by=["count"], ascending=False)
    df.to_csv(csv_file_name)

    print(f'{csv_file_name} created.')

# count = 0
# for term, freq in tfidf_pairs:
#     print(f'{term} : {freq}')
#     count += 1

# print(f'count = {count}')
