import os
import re

import spacy

# Directory holding the text files that the rest of the script rewrites in place.
path = "categories_onetext"

# Collect the full path of every regular file directly inside `path`.
# os.listdir() also returns sub-directories, and passing one of those to
# open() later would raise, so they are filtered out here. Sorting makes the
# processing order (and therefore the progress output) deterministic.
textfiles = sorted(
    fpath
    for fpath in (os.path.join(path, fname) for fname in os.listdir(path))
    if os.path.isfile(fpath)
)

# Matches any run of one or more whitespace characters.
_RE_WHITESPACE = re.compile(r"\s+")

# Pass 1: normalise whitespace in every file, in place. Each line has its
# whitespace runs collapsed to a single space, is stripped at both ends, and
# is terminated with exactly one newline (blank lines stay as empty lines).
for fname in textfiles:

    # An explicit encoding keeps the result independent of the platform's
    # default locale encoding (assumes the corpus is UTF-8).
    with open(fname, 'r', encoding="utf-8") as f:
        data = f.readlines()

    processed_data = [_RE_WHITESPACE.sub(" ", line).strip() + '\n' for line in data]

    # Write back with the same encoding that was used for reading.
    with open(fname, 'w', encoding="utf-8") as f:
        f.writelines(processed_data)

print("Removing whitespace... done")

# Load the large English pipeline without its "ner" component.
nlp = spacy.load("en_core_web_lg", exclude=["ner"])
# Raise the pipeline's maximum accepted text length, since each file is
# passed to the pipeline as one single string.
nlp.max_length = 10000000

# Pass 2: replace the content of every file with its lemmatised form, i.e.
# the lemma of each token joined by single spaces.
for fname in textfiles:

    print(f'Lemmatizing {fname}...')

    # This one file is deliberately skipped and left as it is.
    if "cryptocurrency_en_all.txt" in fname:
        print("Too long to lemmatize")
        continue

    # An explicit encoding keeps the result independent of the platform's
    # default locale encoding (assumes the corpus is UTF-8).
    with open(fname, 'r', encoding="utf-8") as f:
        data = f.read()

    # Run the pipeline only after the file has been closed, so the handle is
    # not held open for the duration of the (slow) processing.
    doc = nlp(data)

    lemmatized_doc = " ".join(token.lemma_ for token in doc)

    # Overwrite the original file with the lemmatised text.
    with open(fname, 'w', encoding="utf-8") as f:
        f.write(lemmatized_doc)