import os
import re

import spacy

# class LemmaTokenizer:
#     def __init__(self):
#         self.nlp = spacy.load('en_core_web_lg', exclude=["ner"])
#         self.nlp.max_length = 3000000
#     def __call__(self, doc):
#         spacy_doc = self.nlp(doc)
#         return [token.lemma_ for token in spacy_doc if (len(token.lemma_) > 1) or (token.lemma_.isalnum())]

# Directory whose files are normalized (and overwritten) in place.
path = "categories_onetext_nonproc"

# os.listdir() returns every directory entry, including sub-directories.
# Keep regular files only so the open() calls below cannot fail on a
# directory part-way through the run, after some files were already rewritten.
textfiles = [os.path.join(path, entry) for entry in os.listdir(path)]
textfiles = [candidate for candidate in textfiles if os.path.isfile(candidate)]

# Matches any run of one or more whitespace characters.
_RE_WHITESPACE = re.compile(r"\s+")

# Pass 1: collapse whitespace line by line and write each file back in place.
# NOTE(review): files are opened with the platform default encoding; confirm
# the corpus encoding and pass `encoding=` explicitly if it is known.
for fname in textfiles:

    with open(fname, 'r') as f:
        data = f.readlines()

    # Every whitespace run becomes a single space, leading/trailing whitespace
    # is stripped, and each line is re-terminated with exactly one '\n'
    # (so a final line without a newline gains one, and blank lines stay blank).
    processed_data = [_RE_WHITESPACE.sub(" ", line).strip() + '\n' for line in data]

    with open(fname, 'w') as f:
        f.writelines(processed_data)

print("Removing whitespace... done")

# Load the large English pipeline without the NER component and raise the
# pipeline's max_length setting to 10,000,000.
nlp = spacy.load("en_core_web_lg", exclude=["ner"])
nlp.max_length = 10000000

# Pass 2: lemmatize every file and overwrite it in place.
# The output is a single space-separated stream of lemmas: str.split()
# discards the input's line breaks and everything is re-joined with spaces.
for fname in textfiles:

    print(f'Lemmatizing {fname}...')

    with open(fname, 'r') as f:
        data = f.read()

    # Feed the pipeline in chunks of `words` whitespace-separated tokens
    # instead of the whole file at once (presumably to bound the memory used
    # by a single nlp() call -- confirm before changing the chunk size).
    tokens = data.split()
    words = 10000
    data_split = [' '.join(tokens[x:x + words]) for x in range(0, len(tokens), words)]

    last_index = len(data_split) - 1
    lemmatized_chunks = []

    for i, chunk in enumerate(data_split):
        # Progress report every 50 chunks.
        if i % 50 == 0:
            print(f'{i} / {last_index} done.')
        doc = nlp(chunk)
        lemmatized_chunks.append(" ".join([token.lemma_ for token in doc]))

    # Join the chunks with a space. Appending them back to back would fuse the
    # last lemma of one chunk with the first lemma of the next into one token.
    lemmatized_doc = " ".join(lemmatized_chunks)

    with open(fname, 'w') as f:
        f.write(lemmatized_doc)
