import os
import json
import jsonlines
import tqdm

# Directory holding the per-language query files together with the combined
# split files (names containing 'train' / 'val') that are checked against them.
dir_path_input = './in-language/'

# Split file to scan for overlapping queries; set this to 'train.jsonl' to
# check the training split instead.
check_file = 'val.jsonl'

files = os.listdir(dir_path_input)

# Maps a file stem (the text before the first '.') to the set of queries read
# from that file. The stems are later looked up by each item's
# "query_language" value, so they are expected to be language codes.
record = dict()

for fname in files:
    # Files whose name mentions a split are the ones being checked, not the
    # reference query lists, so they are not loaded here.
    if 'train' in fname or 'val' in fname:
        continue

    print('*****************************')
    print(fname)

    # Read explicitly as UTF-8: the locale default encoding is not guaranteed
    # to decode non-ASCII query text.
    with open(os.path.join(dir_path_input, fname), encoding='utf-8') as f:
        # A set gives constant-time membership tests in the scan below.
        # NOTE(review): assumes item['query'] is hashable (presumably a
        # string) -- confirm against the data.
        record[fname.split('.')[0]] = {
            item['query'] for item in jsonlines.Reader(f)
        }

# Per-language number of overlapping queries and the overlapping queries
# themselves, pre-seeded so these languages are always reported, even with
# zero overlaps.
count = {name: 0 for name in ['bn', 'fi', 'ja', 'ru', 'te']}
overlap = {name: [] for name in count}

with open(os.path.join(dir_path_input, check_file), encoding='utf-8') as f:
    for item in jsonlines.Reader(f):
        query = item['query']
        language = item["query_language"]
        # A language without a reference file cannot overlap with anything,
        # so it is treated as an empty query set instead of raising KeyError.
        if query in record.get(language, ()):
            # Languages outside the pre-seeded list are added on first hit
            # rather than crashing the scan.
            count[language] = count.get(language, 0) + 1
            overlap.setdefault(language, []).append(query)

print(count)
print(overlap)
