from match import SimpleTokenizer, has_answer
import json
#has_answer(answers, text, tokenizer) -> bool:
# Script: find the dev examples whose predicted answer is found in at least
# one of the example's retrieved passages, report how many there are, and
# write their 0-based indices to 'extractive_indices.txt' (one per line).
tokenizer = SimpleTokenizer()

# Dev set: a sequence of examples, each read below for its 'ctxs' list whose
# items carry a 'text' field.
with open('/data/timchen0618/open_domain_data/NQ/dev.json') as data_file:
    data = json.load(data_file)

# Model output: one line per example; the prediction is the second
# tab-separated field. A line without a tab raises IndexError here, which is
# preferable to silently skipping it and misaligning predictions with data.
with open('/data2/timchen0618/models/ODQA/FiD/checkpoint/dev_rand/final_output.txt') as pred_file:
    pred = [line.strip('\n').split('\t')[1] for line in pred_file]
assert len(data) == len(pred), \
    'number of predictions does not match number of dev examples'

# Sanity-check print of the first prediction (skipped when there are none).
if pred:
    print(pred[0])

# Indices of examples for which has_answer reports the prediction in some
# passage; the prediction is wrapped in a list because has_answer takes a
# list of answers. any() stops at the first matching passage.
inds = [
    i
    for i, (example, answer) in enumerate(zip(data, pred))
    if any(has_answer([answer], ctx['text'], tokenizer) for ctx in example['ctxs'])
]

total = len(data)
n_ans = len(inds)

# Guard the ratio so an empty dev set reports 0.0 instead of dividing by zero.
print(n_ans, total, n_ans / float(total) if total else 0.0)

# The context manager guarantees the index file is flushed and closed.
with open('extractive_indices.txt', 'w') as fw:
    fw.writelines(str(idx) + '\n' for idx in inds)
