import spacy
import os, glob
import itertools

# spaCy pipeline shared by the rest of the script
nlp = spacy.load('en_core_web_lg')

# Get all text files: every .txt file at any depth below the dataset root
TEXT_FILE_PATTERN = 'DUTA_10K_Dataset/**/*.txt'
filelist = list(glob.iglob(TEXT_FILE_PATTERN, recursive=True))

# Report how many files were found
print(len(filelist))

# List files by filesize: map each path to its size in bytes, ordered from
# smallest to largest (sorted() is stable, so equal sizes keep filelist order)
sizes_by_path = {path: os.path.getsize(path) for path in filelist}
filesize = dict(sorted(sizes_by_path.items(), key=lambda entry: entry[1]))

# Count files whose size equals that of the file just before them in the
# size-ordered listing. Equal size is only a hint that two files might be
# duplicates; contents are not compared here.
ordered_sizes = list(filesize.values())
dupcnt = sum(
    1
    for smaller, larger in zip(ordered_sizes, ordered_sizes[1:])
    if smaller == larger
)

print(f'Number of duplicates: {dupcnt}')

# Get document similarity for all files
# if similarity >= threshold, save the pair
#
# Each file is read and run through the spaCy pipeline exactly once. Only the
# document vector, its norm and (for zero-norm documents) the token ids are
# kept, so the pairwise loop below is cheap and memory stays small.

# Texts with at least this many characters are left out of the comparison
MAX_TEXT_CHARS = 1000000
# Minimum similarity for a pair to be reported
SIMILARITY_THRESHOLD = 0.98


def _load_doc_summary(path):
    """Read and parse one file, keeping only what pair comparison needs.

    Args:
        path: Path of the text file to read.

    Returns:
        None when the text has MAX_TEXT_CHARS characters or more, otherwise
        a tuple (vector, norm, orths). orths is the tuple of token ids
        when norm is zero and None otherwise.
    """
    # Decode as UTF-8 regardless of the platform locale; undecodable bytes
    # become U+FFFD instead of aborting the whole run.
    with open(path, mode='r', encoding='utf-8', errors='replace') as handle:
        text = handle.read()

    # Skip if length of text is too long
    if len(text) >= MAX_TEXT_CHARS:
        return None

    doc = nlp(text)
    norm = doc.vector_norm
    # Token ids are only needed for the zero-norm case, where the cosine is
    # undefined and Doc.similarity falls back to comparing tokens.
    orths = tuple(token.orth for token in doc) if norm == 0 else None
    return doc.vector, norm, orths


def _summary_similarity(first, second):
    """Return the similarity of two summaries from _load_doc_summary.

    Intended to reproduce Doc.similarity for the default pipeline (no
    similarity user hook): cosine of the document vectors, 1.0 for
    token-identical zero-norm documents, and 0.0 for any other pair that
    involves a zero-norm document.
    """
    vector1, norm1, orths1 = first
    vector2, norm2, orths2 = second
    if norm1 == 0 or norm2 == 0:
        return 1.0 if orths1 is not None and orths1 == orths2 else 0.0
    return float(vector1.dot(vector2)) / (norm1 * norm2)


# Parse every file once; files that are too long get no entry
doc_summaries = {}
for path in filelist:
    summary = _load_doc_summary(path)
    if summary is not None:
        doc_summaries[path] = summary

similar_pairs = []

for pair in itertools.combinations(filelist, 2):
    first = doc_summaries.get(pair[0])
    second = doc_summaries.get(pair[1])

    # Skip pairs in which either text was too long to parse
    if first is None or second is None:
        continue

    if _summary_similarity(first, second) >= SIMILARITY_THRESHOLD:
        print(pair)
        similar_pairs.append(pair)

