import json

IN_PATH = 'newsspike_gen8_with_translations_before_combing.jsonl'
OUT_PATH = 'newsspike_gen8_with_translations.jsonl'


def collect_entries(lines, progress_every=10000):
    """Group raw JSONL lines by article and line id, keeping the last duplicate.

    Args:
        lines: Iterable of JSONL text lines. Each non-blank line must decode
            to an object with ``articleId`` and ``lineId`` keys.
        progress_every: Print the running line index every this many lines;
            a falsy value disables the progress output.

    Returns:
        A ``(all_entries, swap_count)`` tuple. ``all_entries`` maps
        ``articleId -> {lineId -> raw line}``; ``swap_count`` is the number of
        lines that replaced an earlier line with the same ids.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks ``articleId`` or ``lineId``.
    """
    all_entries = {}
    swap_count = 0
    for lidx, line in enumerate(lines):
        if progress_every and lidx % progress_every == 0:
            print(lidx)
        # Tolerate blank lines (e.g. a trailing newline at end of file)
        # instead of crashing in json.loads.
        if not line.strip():
            continue
        item = json.loads(line)
        article = all_entries.setdefault(item['articleId'], {})
        if item['lineId'] in article:
            swap_count += 1
        # The raw text is stored (not the parsed object) so the record is
        # written back exactly as it was read; later duplicates win.
        article[item['lineId']] = line
    return all_entries, swap_count


def iter_sorted_lines(all_entries, progress_every=1000):
    """Yield the stored lines ordered numerically by article id, then line id.

    Prints a warning whenever two successive article ids are not consecutive
    integers, and prints every article id divisible by ``progress_every``.

    Args:
        all_entries: Mapping ``articleId -> {lineId -> raw line}``; both kinds
            of id must be convertible with ``int()``.
        progress_every: Progress interval in article ids; a falsy value
            disables the progress output.

    Yields:
        Each stored line with surrounding newlines stripped and exactly one
        trailing newline appended.
    """
    last_articleId = None
    # Ids are compared as integers so that e.g. "10" sorts after "9".
    for articleId in sorted(all_entries, key=int):
        if last_articleId is not None and int(articleId) != int(last_articleId) + 1:
            print(f"non-consecutive articleIds! {last_articleId} -> {articleId}")
        last_articleId = articleId

        if progress_every and int(articleId) % progress_every == 0:
            print(articleId)

        article = all_entries[articleId]
        for lineId in sorted(article, key=int):
            yield article[lineId].strip('\n') + '\n'


def main(in_path=IN_PATH, out_path=OUT_PATH):
    """Read ``in_path``, drop duplicate records and write them sorted to ``out_path``.

    The output file is opened only after the input has been read in full, so
    a missing or malformed input does not truncate an existing output file.
    Both files are closed even if an error occurs part-way through.
    """
    with open(in_path, 'r', encoding='utf8') as in_fp:
        all_entries, swap_count = collect_entries(in_fp)
    print(f"Swap count: {swap_count}")

    with open(out_path, 'w', encoding='utf8') as out_fp:
        out_fp.writelines(iter_sorted_lines(all_entries))


if __name__ == "__main__":
    main()