'''
Write the paths of the English-only text files in DUTA-10K to english_duta.txt.
'''

import csv
import glob

# Column-1 values of every CSV row whose column 4 is exactly 'en'
# (presumably the language code -- confirm against the CSV header).
en_list = []

# newline='' leaves line-ending handling to the csv module, as the csv
# documentation asks for any file object handed to csv.reader.
with open('DUTA_10K.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')

    # The first row is the header; the None default keeps an empty file from
    # raising StopIteration here.
    header = next(csv_reader, None)
    for row in csv_reader:
        # csv.reader yields [] for blank lines and short lists for truncated
        # rows; skip those rather than crash with IndexError on row[4].
        if len(row) > 4 and row[4] == 'en':
            en_list.append(row[1])

# Every entry exactly two levels below the dataset root.
data = glob.glob('DUTA_10K_Dataset/*/*')

# Keep the paths that end in "txt", case-insensitively. The check is on the
# bare suffix, so a leading dot is not required for a path to qualify.
txt_data = list(filter(lambda path: path.lower().endswith('txt'), data))

# Paths already written; guards against the same path appearing more than
# once in txt_data.
dup_lines = set()

with open('english_duta.txt', 'w') as f:
    for fname in txt_data:
        if fname in dup_lines:
            continue
        # A path qualifies when any en_list entry occurs in it as a
        # substring. any() stops at the first hit instead of testing every
        # remaining entry for a path that has already matched.
        if any(token in fname for token in en_list):
            f.write(f'{fname}\n')
            dup_lines.add(fname)
