import json
import transformers

# Disabled earlier variant of write_out_item, kept as a bare string literal so
# it is never executed. Unlike the active definition below, it took a tokenizer
# and a normalizer and replaced out_item['splitted_text'] with the result of
# normalizer.tok_norm(...) before serializing.
'''
def write_out_item(out_item, out_fp, tokenizer, normalizer):
	out_item['splitted_text'] = normalizer.tok_norm(out_item['splitted_text'], tokenizer)
	out_line = json.dumps(out_item, ensure_ascii=False)
	out_fp.write(out_line + '\n')
'''


def write_out_item(out_item, out_fp):
	"""Write ``out_item`` to ``out_fp`` as a single JSON line.

	The item is serialized with ``ensure_ascii=False``, so non-ASCII
	characters are emitted verbatim instead of as ``\\uXXXX`` escapes, and
	the record is terminated with a newline.

	Args:
		out_item: Any JSON-serializable object.
		out_fp: Object with a ``write(str)`` method (e.g. a text-mode file).
	"""
	serialized = json.dumps(out_item, ensure_ascii=False)
	out_fp.write(f"{serialized}\n")


def main():
	"""Merge per-sentence translation records into one JSON line per article.

	Reads ``newsspike_gen8_with_translations.jsonl`` line by line; each line
	is a JSON object from which ``trans_s``, ``articleId``, ``date`` and
	``lineId`` are used. Consecutive records sharing the same ``articleId``
	are merged into one output object, written to
	``newsspike_data_entries.jsonl`` via ``write_out_item``.

	Each output object carries:
		text: every ``trans_s`` of the article, each preceded by ``'\\n'``.
		splitted_text: list of the ``trans_s`` strings.
		split_mapping: list of the matching ``lineId`` values.
		num_english_sents: always 0 (initialized here, never updated).
		published / crawled: ``date`` of the last merged record.
		articleId: the article id as an int.

	Records with an empty ``trans_s`` are counted as translation failures
	and skipped. Records whose ``articleId`` is lower than the article
	currently being accumulated are printed and skipped.

	Raises:
		OSError: if either file cannot be opened.
		json.JSONDecodeError: if an input line is not valid JSON.
		KeyError: if a record lacks one of the fields listed above.
	"""
	total_translation_failures = 0
	num_entries = 0

	cur_article_id = 0
	out_item = None

	# Context managers guarantee both files are closed (and the output
	# flushed) even when a malformed line raises part-way through.
	with open('newsspike_gen8_with_translations.jsonl', 'r', encoding='utf8') as input_fp, \
			open('newsspike_data_entries.jsonl', 'w', encoding='utf8') as output_fp:
		for lidx, line in enumerate(input_fp):
			# Progress indicator.
			if lidx % 100000 == 0:
				print(lidx)

			item = json.loads(line)

			if len(item['trans_s']) == 0:
				total_translation_failures += 1
				continue

			article_id = int(item['articleId'])

			if article_id < cur_article_id:
				# Out-of-order record: report it and leave it out.
				print(item)
				continue

			# ``out_item is None`` covers the very first kept record, which
			# must open a new article even when its id equals the initial
			# ``cur_article_id`` of 0.
			if out_item is None or article_id != cur_article_id:
				if out_item is not None:
					write_out_item(out_item, output_fp)
					num_entries += 1
				out_item = {'text': '', 'splitted_text': [], 'split_mapping': [], 'num_english_sents': 0}
				cur_article_id = article_id

			# Every sentence is prefixed with a newline, so 'text' starts
			# with '\n'; kept as-is to preserve the output format.
			out_item['text'] += '\n' + item['trans_s']
			out_item['splitted_text'].append(item['trans_s'])
			out_item['published'] = item['date']
			out_item['crawled'] = item['date']
			out_item['articleId'] = cur_article_id
			out_item['split_mapping'].append(item['lineId'])

		# Flush the last article, but only if at least one record was kept;
		# otherwise nothing is written and nothing is counted.
		if out_item is not None:
			write_out_item(out_item, output_fp)
			num_entries += 1

	print(f"Total number of translation failures: {total_translation_failures}")
	print(f"Total number of entries: {num_entries}")


# Run the conversion only when this file is executed as a script, not on import.
if __name__ == '__main__':
	main()
