import logging
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

from string import punctuation
from nltk import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
# from sklearn.datasets import fetch_20newsgroups

def fetch_20newsgroups(path="datasets/20news/text.txt", encoding="utf-8"):
    """Load the newsgroup corpus from a plain-text file, one entry per line.

    Shares its name with scikit-learn's loader (see the commented-out import
    at the top of the file) but returns a plain list of strings rather than
    a dataset object.

    Args:
        path: Location of the text file. Defaults to the path the script has
            always read from, so existing zero-argument calls are unchanged.
        encoding: Text encoding used to decode the file. Given explicitly so
            the result does not depend on the platform's locale default.

    Returns:
        A list with one string per line of the file, in file order, with
        trailing whitespace (including the newline) stripped. Blank lines
        are kept as empty strings.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid in ``encoding``.
    """
    with open(path, "r", encoding=encoding) as f:
        return [line.rstrip() for line in f]
        

# Read the raw corpus into memory (one string per line of the text file).
newsgroups = fetch_20newsgroups()

# Fetch the NLTK stop-word corpus before reading the English list from it.
nltk.download('stopwords')
eng_stopwords = set(stopwords.words('english'))

# Shared helpers used by text2tokens().
stemmer = PorterStemmer()
tokenizer = RegexpTokenizer(r'\s+', gaps=True)  # pattern marks the gaps: split on whitespace runs
translate_tab = dict.fromkeys(map(ord, punctuation), u" ")  # every punctuation char -> space

def text2tokens(raw_text):
    """Turn a raw text string into a list of stemmed tokens.

    The text is lower-cased, punctuation is replaced by spaces, and the
    result is split on whitespace. English stop words are dropped before
    stemming; stems of two characters or fewer are dropped afterwards.
    """
    normalized = raw_text.lower().translate(translate_tab)

    kept = []
    for piece in tokenizer.tokenize(normalized):
        word = piece.strip()
        if word in eng_stopwords:
            continue
        stem = stemmer.stem(word)
        if len(stem) > 2:  # skip short tokens
            kept.append(stem)
    return kept

from gensim.corpora import Dictionary
from gensim.models import LdaMulticore

# LdaMulticore starts worker processes. Under the "spawn" start method every
# worker re-imports this file, so the training pipeline has to sit behind the
# __main__ guard; without it each worker would try to launch its own training
# run and multiprocessing aborts during bootstrapping.
if __name__ == "__main__":
    # Convert every raw document into a list of stemmed tokens.
    dataset = [text2tokens(txt) for txt in newsgroups]

    # Build the vocabulary, then drop tokens that are too rare or too common:
    # no_below=5 and no_above=0.3 are document-frequency thresholds, and
    # keep_n=None puts no cap on the vocabulary size.
    dictionary = Dictionary(documents=dataset, prune_at=None)
    dictionary.filter_extremes(no_below=5, no_above=0.3, keep_n=None)
    dictionary.compactify()  # reassign token ids so they have no gaps

    # Convert each token list into its bag-of-words representation.
    d2b_dataset = [dictionary.doc2bow(doc) for doc in dataset]

    num_topics = 15

    ldamodel = LdaMulticore(
        corpus=d2b_dataset, num_topics=num_topics, id2word=dictionary,
        workers=4, eval_every=None, passes=20, batch=True,
    )

    # Print the 30 top words of every topic (words only, weights omitted).
    for index, topic in ldamodel.show_topics(formatted=False, num_topics=num_topics, num_words=30):
        print('Topic: {}, Words: {} \n'.format(index, [word for word, _ in topic]))

    # Preview the topic mixture of the first few documents. The limit is
    # checked after printing so no document beyond it is ever transformed.
    max_preview_docs = 10
    for doc_index, doc_topics in enumerate(ldamodel[d2b_dataset]):
        print('doc:', doc_index, doc_topics)
        if doc_index + 1 >= max_preview_docs:
            break
