import gzip
import json
from os import path, listdir
from sys import getsizeof
import _pickle as cPickle
from collections import Counter
from nltk import word_tokenize
from gensim import utils

# Process Wikipedia: tokenize every section of every article in the dump and
# checkpoint the running token counts to WRITE_DIR whenever the counter object
# exceeds a size threshold. Whatever is left at the end goes to a final file.
SOURCE_DIR = ''
WRITE_DIR = ''
token_counter = Counter()
last_title = []   # title of the article at which each checkpoint was taken
last_index = []   # line index (within the dump) of that same article
write_number = 0  # sequence number used in the next checkpoint's file name

# NOTE(review): the file name reads "2017920" while the directory configured
# later in this file is "enwiki-20170920" - confirm the name on disk.
with utils.open(path.join(SOURCE_DIR, 'enwiki-2017920.json.gz'), 'rb') as reader:

    # Each line of the dump is parsed as one JSON article.
    for idx, k in enumerate(reader):
        article = json.loads(k)
        title = article['title']

        for section_text in article['section_texts']:
            token_counter.update(word_tokenize(section_text))

        # sys.getsizeof is shallow: this measures the counter object itself,
        # not the token strings and integers it refers to.
        if getsizeof(token_counter) <= 100000000:
            continue

        last_title.append(title)
        last_index.append(idx)

        # The counts go to a numbered file; the two bookkeeping lists are
        # overwritten in full at every checkpoint.
        checkpoint_files = (
            (f'wiki_count_{write_number}.pkl', token_counter),
            ('last_titles.pkl', last_title),
            ('last_idx.pkl', last_index),
        )
        for file_name, payload in checkpoint_files:
            with open(path.join(WRITE_DIR, file_name), 'wb') as pkl_writer:
                cPickle.dump(payload, pkl_writer)

        write_number += 1
        token_counter = Counter()  # start the next chunk from scratch
        print(f'dumped_{write_number}')

    # Counts accumulated since the last checkpoint.
    with open(path.join(WRITE_DIR, 'wiki_count_final.pkl'), 'wb') as pkl_writer:
        cPickle.dump(token_counter, pkl_writer)

# Get Wikipedia name counts: for every name in names.pkl, sum its frequency
# over all token-count checkpoints stored in WRITE_DIR and pickle the totals.
with open('names.pkl', 'rb') as pkl_reader:
    names = cPickle.load(pkl_reader)

name_freq_dict = {i: 0 for i in names}

# WRITE_DIR holds more than token counters: last_titles.pkl and last_idx.pkl
# are plain lists, and wikipedia_name_counts.pkl is this step's own output
# from a previous run. Indexing the lists by name raises TypeError and
# re-reading the old output would double count, so only the
# wiki_count_*.pkl checkpoints are aggregated.
count_files = [
    entry for entry in listdir(WRITE_DIR)
    if entry.startswith('wiki_count_') and entry.endswith('.pkl')
]

for f in count_files:
    with open(path.join(WRITE_DIR, f), 'rb') as pkl_reader:
        count_data = cPickle.load(pkl_reader)
    # The checkpoints are Counter objects, which yield 0 for unseen tokens,
    # so names absent from a chunk simply add nothing.
    for name in name_freq_dict:
        name_freq_dict[name] += count_data[name]
    print(f)  # progress: one line per processed checkpoint

with open(path.join(WRITE_DIR, 'wikipedia_name_counts.pkl'), 'wb') as pkl_writer:
    cPickle.dump(name_freq_dict, pkl_writer)

# Combine Wikipedia and Book Corpus counts: add the per-name frequencies of
# both corpora and store the result as bert_name_frequencies.pkl.
SOURCE_DIR = 'D:\\datasets\\english_wikipedia_2017\\enwiki-20170920'
WRITE_DIR = 'D:\\datasets\\english_wikipedia_2017\\counts'

wiki_counts_path = path.join(WRITE_DIR, 'wikipedia_name_counts.pkl')
with open(wiki_counts_path, 'rb') as pkl_reader:
    wiki_names = cPickle.load(pkl_reader)

with open('D:\\datasets\\book_corpus_name_frequencies.pkl', 'rb') as pkl_reader:
    book_names = cPickle.load(pkl_reader)

# Every name in the Wikipedia counts is looked up in the book counts;
# wiki_names itself is left untouched.
bert_dict = {
    name: wiki_count + book_names[name]
    for name, wiki_count in dict(wiki_names).items()
}

with open('D:\\bert_name_frequencies.pkl', 'wb') as pkl_writer:
    cPickle.dump(bert_dict, pkl_writer)

# Count C4 tokens: write one pickled token Counter per gzip file in t_dir,
# skipping files whose count file already exists in w_dir so that an
# interrupted run can be resumed.
t_dir = ''
w_dir = ''

# Count files are named 'c4_count_<source file name>.pkl'. Recover the source
# file name by stripping exactly that prefix and suffix; slicing a fixed ten
# characters (the prefix has nine) and keeping '.pkl' never matched any
# source file, so nothing was ever skipped.
count_prefix = 'c4_count_'
count_suffix = '.pkl'
gathered = {
    i[len(count_prefix):-len(count_suffix)]
    for i in listdir(w_dir)
    if i.startswith(count_prefix) and i.endswith(count_suffix)
}
dir_cont = [i for i in listdir(t_dir) if i not in gathered]

for f_name in dir_cont:
    # The whole decompressed file is tokenized as one text. A
    # json.dumps/json.loads round-trip of a str returns the same str, so it
    # is not needed here.
    # NOTE(review): if the files contain JSON records, their keys and
    # punctuation are tokenized and counted too - confirm this is intended.
    with gzip.open(path.join(t_dir, f_name), 'r') as reader:
        data = reader.read().decode('utf-8')
    data = word_tokenize(data)
    subcount = Counter(data)
    with open(path.join(w_dir, f'c4_count_{f_name}.pkl'), 'wb') as pkl_writer:
        cPickle.dump(subcount, pkl_writer)

# Get C4 name counts: for every name in names.pkl, sum its frequency over the
# pickled count files in target_dir and pickle the totals.
target_dir = ''

with open('names.pkl', 'rb') as pkl_reader:
    names = cPickle.load(pkl_reader)

name_freq_dict = dict.fromkeys(names, 0)

# Files with 'validation' in their name are left out of the totals.
count_files = [entry for entry in listdir(target_dir) if 'validation' not in entry]

for f in count_files:
    with open(path.join(target_dir, f), 'rb') as pkl_reader:
        count_data = cPickle.load(pkl_reader)
    for name in name_freq_dict:
        name_freq_dict[name] += count_data[name]

with open('c4_name_counts.pkl', 'wb') as pkl_writer:
    cPickle.dump(name_freq_dict, pkl_writer)