import json


def _count_lines(path):
	"""Return the number of lines in the UTF-8 text file at *path*."""
	with open(path, 'r', encoding='utf8') as fp:
		return sum(1 for _ in fp)


def check_webhose(
	transed_fn='./webhose_data_entries_with_translations_blank_filled.jsonl',
	raw_fn='/Users/teddy/Files/Potential Corpus/WebHoses_Chinese_News_Articles/webhose_data_entries_no_corenlp.jsonl',
):
	"""Check a translated JSONL file line-by-line against its raw counterpart.

	Both files are read as UTF-8 with one JSON object per line. The check
	verifies that:

	* both files have the same number of lines;
	* for every line pair, the ``'splitted_text'`` entries have the same
	  length and are element-wise equal;
	* for every translated entry whose ``'translation_mismatch'`` value is
	  falsy, ``'english_splitted_text'`` has the same length as
	  ``'splitted_text'``.

	Entries with a truthy ``'translation_mismatch'`` are counted, and the
	count is printed at the end. The total line count and a progress index
	(every 10000 lines) are printed as well.

	Args:
		transed_fn: Path to the JSONL file containing the translations.
		raw_fn: Path to the JSONL file containing the raw entries.

	Raises:
		AssertionError: If any of the consistency checks above fails. The
			exception is raised explicitly rather than through ``assert``
			so that the checks still run under ``python -O``.
	"""
	transed_num_lines = _count_lines(transed_fn)
	raw_num_lines = _count_lines(raw_fn)

	if transed_num_lines != raw_num_lines:
		raise AssertionError(
			f"Line count mismatch: {transed_num_lines} translated lines "
			f"vs {raw_num_lines} raw lines"
		)

	print(f"Number of documents: {raw_num_lines}")

	remaining_mismatch_count = 0

	# Context managers guarantee both files are closed even when a check
	# fails or a line is not valid JSON.
	with open(transed_fn, 'r', encoding='utf8') as transed_fp, \
			open(raw_fn, 'r', encoding='utf8') as raw_fp:
		for lidx, (transed_line, raw_line) in enumerate(zip(transed_fp, raw_fp)):
			if lidx % 10000 == 0:
				print(lidx)
			transed_item = json.loads(transed_line)
			raw_item = json.loads(raw_line)
			transed_sents = transed_item['splitted_text']
			raw_sents = raw_item['splitted_text']

			if len(transed_sents) != len(raw_sents):
				raise AssertionError(
					f"Line {lidx}: 'splitted_text' has {len(transed_sents)} "
					f"items in the translated file but {len(raw_sents)} in the raw file"
				)

			if transed_item['translation_mismatch']:
				remaining_mismatch_count += 1
			elif len(transed_item['english_splitted_text']) != len(transed_sents):
				raise AssertionError(
					f"Line {lidx}: 'english_splitted_text' has "
					f"{len(transed_item['english_splitted_text'])} items but "
					f"'splitted_text' has {len(transed_sents)}"
				)

			for sidx, (t_sent, r_sent) in enumerate(zip(transed_sents, raw_sents)):
				if t_sent != r_sent:
					raise AssertionError(
						f"Line {lidx}, sentence {sidx}: translated and raw "
						f"'splitted_text' differ"
					)

	print(f"Remaining mismatch count: {remaining_mismatch_count}")


# Run the consistency check only when this file is executed as a script.
if __name__ == '__main__':
	check_webhose()
