import ddparser
import json
from extract import FineGrainedInfo
from extract import CoarseGrainedInfo
import time
import argparse
import re


# reformat from (([S_id, S], [V_id, V], [O_id, O]), 'SVO') into ((S, V, O), 'SVO', (S_id, V_id, O_id))
def reformat_cur_rels(cur_rels):
	"""Split each relation's (index, token) pairs into parallel token and index lists.

	Every relation is read as ``(args, label)`` where each element of ``args`` is
	either ``None`` or an ``[index, token]`` pair. It is rewritten as
	``(tokens, label, indices)``; a ``None`` argument stays ``None`` in both lists.

	:param cur_rels: list (one entry per sentence) of lists of relations.
	:return: a new list with the same nesting, holding the rewritten relations.
	"""
	document = []
	for sentence in cur_rels:
		converted = []
		for rel in sentence:
			# one pass over the arguments, yielding (token, index) for each slot
			pairs = [(None, None) if arg is None else (arg[1], arg[0]) for arg in rel[0]]
			tokens = [pair[0] for pair in pairs]
			indices = [pair[1] for pair in pairs]
			converted.append((tokens, rel[1], indices))
		document.append(converted)
	return document


# OBSOLETE
# add a dummy () into ((S, V, O), 'SVO', ()) in order to maintain same structure as fine_rels
'''
def reformat_coarse_rels(coarse_rels):
	reformatted = []
	for sent_id, sent_rels in enumerate(coarse_rels):
		sent_reformatted = []
		for rel in sent_rels:
			new_rel = [rel[0], rel[1], ()]
			sent_reformatted.append(new_rel)
		reformatted.append(sent_reformatted)
	return reformatted
'''


# merge entries in dict_2 into dict_1
def merge_dict(dict_1, dict_2):
	"""Fold ``dict_2`` into ``dict_1`` in place.

	Keys missing from ``dict_1`` are copied over; for keys present in both, the
	value from ``dict_2`` is added onto the existing one with ``+=``.

	:param dict_1: dictionary that is updated in place.
	:param dict_2: dictionary whose entries are merged in.
	:return: None.
	"""
	for key, incoming in dict_2.items():
		if key in dict_1:
			dict_1[key] += incoming
		else:
			dict_1[key] = incoming


# discard all relation tuples except the triples for SVO relations; pure output for downstream linking & typing.
def only_keep_svo(cur_rels):
	"""Keep only the relations labelled ``'SVO'`` in every sentence.

	:param cur_rels: list (one entry per sentence) of lists of relations, where
		``rel[1]`` is the relation label.
	:return: a new list with one (possibly empty) list per input sentence,
		containing only the relations whose label equals ``'SVO'``.
	"""
	return [
		[rel for rel in sentence if rel[1] == 'SVO']
		for sentence in cur_rels
	]


# for each DOB relation, add another two SVO relations to the relation list (split this DOB up into 2)
def translate_nary_to_binaries(cur_rels):
	"""Expand every ``'DOB'`` relation into two ``'SVO'`` relations.

	A ``'DOB'`` relation with arguments at positions 0..3 yields one SVO triple
	built from positions (0, 1, 2) and one from positions (0, 1, 3). Both are
	placed in the output, followed by the original ``'DOB'`` relation itself.
	Relations with any other label are passed through unchanged.

	:param cur_rels: list (one entry per sentence) of lists of relations shaped
		``(tokens, label, indices)``.
	:return: a new list with the same nesting containing the expanded relations.
	"""

	def as_svo(rel, obj_pos):
		# build one binary relation using the argument at obj_pos as the object
		tokens = (rel[0][0], rel[0][1], rel[0][obj_pos])
		indices = (rel[2][0], rel[2][1], rel[2][obj_pos])
		return (tokens, 'SVO', indices)

	document = []
	for sentence in cur_rels:
		expanded = []
		for rel in sentence:
			if rel[1] == 'DOB':
				first = as_svo(rel, 2)
				second = as_svo(rel, 3)
				expanded.extend((first, second, rel))
			else:
				expanded.append(rel)
		document.append(expanded)
	return document


# input: fine_grained and coarse_grained relations in a document, plus some globally-maintained statistics
#
# output: filtered fine and coarse grained relations in a document (a list of sentences), plus some globally-maintained
# statistics
def filter_triples_stopwords(cur_rels, stop_word_list, cur_stop_word_count_bucket=None, cur_number_count=0, MUST_INCLUDE_CHINESE_flag=True):
	"""Drop SVO triples whose subject or object is a stop word, a number, or non-Chinese.

	Only relations labelled ``'SVO'`` are inspected; every other relation is kept
	as is. An SVO triple is dropped when its subject or its object

	* is contained in ``stop_word_list``, or
	* consists solely of the characters ``0-9`` and ``.``, or
	* (when ``MUST_INCLUDE_CHINESE_flag`` is true) is a non-empty string without
	  any character in the range U+4E00..U+9FFF.

	``None`` or empty arguments never cause a triple to be dropped.

	:param cur_rels: list (one entry per sentence) of lists of relations shaped
		``(tokens, label, ...)``.
	:param stop_word_list: collection of argument strings to reject.
	:param cur_stop_word_count_bucket: optional dict mapping stop word -> count;
		updated in place, once per offending argument.
	:param cur_number_count: running count of arguments rejected as number /
		non-Chinese; the updated value is returned.
	:param MUST_INCLUDE_CHINESE_flag: whether arguments must contain at least one
		Chinese character.
	:return: ``(filtered_cur_rels, cur_number_count)``.
	"""
	digit_chars = set('0123456789.')
	# compiled once per call instead of once per argument
	chinese_char = re.compile(r'[\u4e00-\u9fff]')

	def all_digits(string):
		# None / empty strings are not considered numbers
		return bool(string) and all(c in digit_chars for c in string)

	def no_chinese_char(string):
		# None / empty strings are not reported as "lacking Chinese"
		return bool(string) and chinese_char.search(string) is None

	def is_non_word(arg):
		return all_digits(arg) or (MUST_INCLUDE_CHINESE_flag and no_chinese_char(arg))

	filtered_cur_rels = []
	for sent_cur_rels in cur_rels:
		filtered_sent_cur_rels = []
		for rel in sent_cur_rels:
			if rel[1] != 'SVO':
				filtered_sent_cur_rels.append(rel)
				continue

			keep = True
			# Subject and object go through the same checks. Each is tested on
			# its own value (the object check used to look at the subject).
			for arg in (rel[0][0], rel[0][2]):
				if arg in stop_word_list:
					keep = False
					if cur_stop_word_count_bucket is not None:
						# .get() so that a bucket lacking this key does not raise
						cur_stop_word_count_bucket[arg] = cur_stop_word_count_bucket.get(arg, 0) + 1
				elif is_non_word(arg):
					keep = False
					cur_number_count += 1
			if keep:
				filtered_sent_cur_rels.append(rel)
		filtered_cur_rels.append(filtered_sent_cur_rels)

	return filtered_cur_rels, cur_number_count


# build amendment relations for documents, each containing k sentences
def build_amendment_relations(ddp_res, fine_rels, coarse_rels, DEBUG):
	"""Derive amended SVO relations for one document and prune the relations they replace.

	Three amendment passes are enabled by the flags below (POB, VV, V_CMP); the
	HEAD_NEG pass is switched off and has no implementation. Each pass is run
	separately on the fine-grained and on the coarse-grained relations of every
	sentence.

	:param ddp_res: list of per-sentence parse results; each entry is indexed with
		the keys ``'word'``, ``'head'`` and ``'deprel'``. ``head`` values are
		decremented by one before being used as list indices.
	:param fine_rels: per-sentence lists of fine-grained relations; each relation
		is read as ``rel[0]`` = argument tokens, ``rel[1]`` = label, ``rel[2]`` =
		argument indices.
	:param coarse_rels: per-sentence lists of coarse-grained relations, read the
		same way as ``fine_rels``.
	:param DEBUG: when truthy, every amendment found is printed.
	:return: ``(amend_fine_rels, amend_coarse_rels, pruned_fine_rels,
		pruned_coarse_rels, vcmp_dict)``. The ``amend_*`` lists hold the newly built
		relations per sentence. The ``pruned_*`` lists hold the input relations
		minus those whose position was recorded as discarded. ``vcmp_dict`` maps
		complement token -> number of V_CMP amendments it took part in (incremented
		by both the fine and the coarse pass).
	:raises AssertionError: if the three inputs do not have the same length.
	"""

	POB_flag = True
	VV_flag = True
	VCMP_flag = True
	HEAD_NEG_flag = False
	ALL_ADV_pivot_flag = False
	Discard_POB_where_ADV_rel_not_found_flag = True

	pivot_adv_list = ['与', '和', '跟', '同']  # for POB
	vcmp_dict = {}

	amend_fine_rels = []  # the amended fine-grained relations of the whole document
	amend_coarse_rels = []  # the amended coarse-grained relations of the whole document
	pruned_fine_rels = []
	pruned_coarse_rels = []
	assert len(ddp_res) == len(fine_rels)
	assert len(ddp_res) == len(coarse_rels)
	for sent_id in range(len(ddp_res)):
		# Example ddp:
		# [{'word': ['张三', '急匆匆', '地', '赶往', '机场', '。'], 'head': [4, 4, 2, 0, 4, 4], 'deprel': ['SBV', 'ADV', 'MT', 'HED', 'VOB', 'MT']}]
		r = ddp_res[sent_id]
		f = fine_rels[sent_id]
		c = coarse_rels[sent_id]
		a_f = []  # sentence-wise amended fine_grained tokens
		a_c = []  # sentence-wise amended coarse_grained tokens
		f_discard_idxs = []  # positions in f that were merged into an amended relation
		c_discard_idxs = []  # positions in c that were merged into an amended relation

		# POB
		# [{'word': ['中国', '与', '印度', '接壤'], 'head': [4, 4, 2, 0], 'deprel': ['SBV', 'ADV', 'POB', 'HED']}]
		# Idea: if we find nouns with POB relation, headed by an ADV or CMP, and with SVO head Vs as grandparents,
		# then if this SVO relations has empty object position, that means this object-of-preposition (POB) should
		# in fact be the object of this SVO relation, and we amend this.
		if POB_flag is True:
			for lbl_idx, lbl in enumerate(r['deprel']):
				if lbl == 'POB':
					obj_token = r['word'][lbl_idx]
					# NOTE(review): when head is 0 this becomes -1 and the next line reads the LAST word;
					# the adv_idx >= 0 guard only runs after the pivot-list check below.
					adv_idx = r['head'][lbl_idx] - 1
					adv_token = r['word'][adv_idx]

					# if the adverb token does not lie in the list where their POB can be linked with their parent verb:
					if adv_token not in pivot_adv_list and not ALL_ADV_pivot_flag:
						continue

					if adv_idx >= 0 and r['deprel'][adv_idx] in ['ADV',
																 'CMP']:  # if the POB-labelled token is controlled by an ADV
						verb_idx = r['head'][adv_idx] - 1
						# if there exists a relation triple heading with this verb grandparent of the POB,
						# and the relation has empty object, then put the POB in the object position
						if verb_idx >= 0:
							verb_token = r['word'][verb_idx]
							for f_rel_idx, f_rel in enumerate(f):
								if f_rel[1] != 'SVO':
									continue
								# fine pass: predicate must match on both token and index
								if f_rel[0][1] == verb_token and f_rel[2][1] == verb_idx and f_rel[0][2] is None:
									# require an ADV_V relation on the same verb whose first argument contains the POB token
									POB_matching_flag = False
									for f_rel_idx_ref, f_rel_ref in enumerate(f):
										if f_rel_ref[1] == 'ADV_V' and f_rel_ref[0][1] == verb_token and \
												obj_token in f_rel_ref[0][0]:
											POB_matching_flag = True
									if POB_matching_flag is False and Discard_POB_where_ADV_rel_not_found_flag:
										continue
									# new predicate is "<adverb>·<verb>", new object is the POB token
									f_rel_new = ((f_rel[0][0], adv_token+'·'+f_rel[0][1], obj_token), f_rel[1], (f_rel[2][0], f_rel[2][1], lbl_idx))
									if DEBUG:
										print(f"POB:		Instance found (fine)!")
										print(f"POB:		{f_rel_new[0][0]}; {f_rel_new[0][1]}; {f_rel_new[0][2]}")
										print(f"POB:		{r['word']}")
									a_f.append(f_rel_new)
									f_discard_idxs.append(f_rel_idx)

							for c_rel_idx, c_rel in enumerate(c):
								if c_rel[1] != 'SVO':
									continue
								# coarse pass: predicate is matched on the token only (no index comparison here)
								if c_rel[0][1] == verb_token and c_rel[0][2] is None:
									# the coarse object is taken from the matching ADV_V relation's first argument
									coarse_obj_token = None
									for c_rel_idx_ref, c_rel_ref in enumerate(c):
										if c_rel_ref[1] == 'ADV_V' and c_rel_ref[0][1] == verb_token and \
												obj_token in c_rel_ref[0][0]:
											coarse_obj_token = c_rel_ref[0][0]
									if coarse_obj_token is None:
										if Discard_POB_where_ADV_rel_not_found_flag:
											continue
										else:
											coarse_obj_token = obj_token
									c_rel_new = ((c_rel[0][0], adv_token+'·'+c_rel[0][1], coarse_obj_token), c_rel[1], (c_rel[2][0], c_rel[2][1], lbl_idx))
									if DEBUG:
										print(f"POB:		Instance found (coarse)!")
										print(f"POB:		{c_rel_new[0][0]}; {c_rel_new[0][1]}; {c_rel_new[0][2]}")
										print(f"POB:		{r['word']}")
									a_c.append(c_rel_new)
									c_discard_idxs.append(c_rel_idx)

		# VV (or other cases where (subj, pred, None), (None, pred, obj))
		# Dealing with conjuncted verbs. In seen examples, when faced with verb conjunction, they are failing to link
		# the object with the VV-inspired (subj, V) structure, and failing to link the subject to the local (V, obj)
		# structure. So if there exists a verb heading two SVO structures, one without subject, the other without object,
		# and the verb is itself bearing a VV dependency label (projected most probably from the verb bearing HEAD label),
		# then we consider the two relations as should have been merged into one.
		if VV_flag is True:
			for f_rel_idx_1, f_rel_1 in enumerate(f):
				# the relation's predicate index is used directly as an index into r['deprel']
				if f_rel_1[1] != 'SVO' or r['deprel'][f_rel_1[2][1]] != 'VV':
					continue
				for f_rel_idx_2, f_rel_2 in enumerate(f):
					if f_rel_2[1] != 'SVO' or f_rel_idx_1 == f_rel_idx_2:
						continue
					# if the two 'SVO' relations have the same 'VV' labelled head (word and index), and f_rel_1's object
					# is None and f_rel_2's subject is None:
					if f_rel_1[0][1] == f_rel_2[0][1] and f_rel_1[2][1] == f_rel_2[2][1] and f_rel_1[0][2] is None and f_rel_2[0][0] is None:
						f_rel_new = ((f_rel_1[0][0], f_rel_1[0][1], f_rel_2[0][2]), 'SVO', (f_rel_1[2][0], f_rel_1[2][1], f_rel_2[2][2]))
						if DEBUG:
							print("VV:		Instance found (fine)!")
							print(f"VV:		{f_rel_new[0][0]}; {f_rel_new[0][1]}; {f_rel_new[0][2]}")
							print(f"VV:		{r['word']}")
						a_f.append(f_rel_new)
						f_discard_idxs.append(f_rel_idx_1)
						f_discard_idxs.append(f_rel_idx_2)
						break  # the paired object for this half-extracted relation f_rel_1 is found, stop iterating.

			for c_rel_idx_1, c_rel_1 in enumerate(c):
				if c_rel_1[1] != 'SVO' or r['deprel'][c_rel_1[2][1]] != 'VV':
					continue
				for c_rel_idx_2, c_rel_2 in enumerate(c):
					if c_rel_2[1] != 'SVO' or c_rel_idx_1 == c_rel_idx_2:
						continue
					# if the two 'SVO' relations have the same 'VV' labelled head (word and index), and c_rel_1's object
					# is None and c_rel_2's subject is None:
					if c_rel_1[0][1] == c_rel_2[0][1] and c_rel_1[2][1] == c_rel_2[2][1] and c_rel_1[0][2] is None and c_rel_2[0][0] is None:
						c_rel_new = ((c_rel_1[0][0], c_rel_1[0][1], c_rel_2[0][2]), 'SVO', (c_rel_1[2][0], c_rel_1[2][1], c_rel_2[2][2]))
						if DEBUG:
							print("VV:		 Instance found (coarse)!")
							print(f"VV:		{c_rel_new[0][0]}; {c_rel_new[0][1]}; {c_rel_new[0][2]}")
							print(f"VV:		{r['word']}")
						a_c.append(c_rel_new)
						c_discard_idxs.append(c_rel_idx_1)
						c_discard_idxs.append(c_rel_idx_2)
						break  # the paired object for this half-extracted relation c_rel_1 is found, stop iterating.

		# V_CMP: ((subj, pred_1, None), 'SVO'), ((pred_1, pred_2), 'V_CMP'), ((None, pred_2, obj), 'SVO')
		# f_rel_1: ((None, to, library), 'SVO', (None, 3, 4))
		# f_rel_2: ((walk, to), 'V_CMP', (2, 3));
		# f_rel_3: ((I, walk, None), 'SVO', (1, 2, None));
		# Turned into: ((I, walk to, library), 'SVO', (1, 2, 4))
		# Combine Vs with particles to create new predicates: When a SVO head is itself a complement in V_CMP relation,
		# and its parent in the V_CMP relation is involved in an SVO with None object, and it itself is involved in an
		# SVO relation with None subject, then the V_CMP-related two words should be in one predicate, and the subject
		# and object should be merged into one relation.

		if VCMP_flag is True:
			for f_rel_idx_1, f_rel_1 in enumerate(f):
				if f_rel_1[1] != 'SVO' or f_rel_1[0][0] is not None:  # f_rel_1 has to be SVO with empty subject
					continue
				for f_rel_idx_2, f_rel_2 in enumerate(f):
					# f_rel_2 must be 'V_CMP', and the predicate of f_rel_1 is the dependent of f_rel_2
					if f_rel_2[1] == 'V_CMP' and f_rel_2[0][1] == f_rel_1[0][1] and f_rel_2[2][1] == f_rel_1[2][1]:
						complement_token = f_rel_2[0][1]
						for f_rel_idx_3, f_rel_3 in enumerate(f):
							# f_rel_3 must be 'SVO', and the predicate of f_rel_3 is the head of the V_CMP in f_rel_2,
							# and f_rel_3 must have empty object.
							if f_rel_3[1] == 'SVO' and f_rel_3[0][1] == f_rel_2[0][0] and f_rel_3[2][1] == f_rel_2[2][0] and f_rel_3[0][2] is None:
								# the new predicate is the head verb concatenated with its complement
								f_rel_new = ((f_rel_3[0][0], f_rel_2[0][0]+f_rel_2[0][1], f_rel_1[0][2]), 'SVO', (f_rel_3[2][0], f_rel_2[2][0], f_rel_1[2][2]))
								if DEBUG:
									print("V_CMP	Instance found (fine)!")
									print(f"V_CMP:		{f_rel_new[0][0]}; {f_rel_new[0][1]}; {f_rel_new[0][2]}")
									print(f"V_CMP:		{r['word']}")
								a_f.append(f_rel_new)
								f_discard_idxs.append(f_rel_idx_1)
								f_discard_idxs.append(f_rel_idx_3)
								if complement_token not in vcmp_dict:
									vcmp_dict[complement_token] = 1
								else:
									vcmp_dict[complement_token] += 1
								continue  # there could be multiple co-ordinated subjects
						break  # but there could be only one V_CMP structure for each CMP (since we're assuming this to be a dependency tree, not a graph).

			for c_rel_idx_1, c_rel_1 in enumerate(c):
				if c_rel_1[1] != 'SVO' or c_rel_1[0][0] is not None:  # c_rel_1 has to be SVO with empty subject
					continue
				for c_rel_idx_2, c_rel_2 in enumerate(c):
					# c_rel_2 must be 'V_CMP', and the predicate of c_rel_1 is the dependent of c_rel_2
					if c_rel_2[1] == 'V_CMP' and c_rel_2[0][1] == c_rel_1[0][1] and c_rel_2[2][1] == c_rel_1[2][1]:
						complement_token = c_rel_2[0][1]
						for c_rel_idx_3, c_rel_3 in enumerate(c):
							# c_rel_3 must be 'SVO', and the predicate of c_rel_3 is the head of the V_CMP in c_rel_2,
							# and c_rel_3 must have empty object.
							if c_rel_3[1] == 'SVO' and c_rel_3[0][1] == c_rel_2[0][0] and c_rel_3[2][1] == c_rel_2[2][
								0] and c_rel_3[0][2] is None:
								c_rel_new = ((c_rel_3[0][0], c_rel_2[0][0] + c_rel_2[0][1], c_rel_1[0][2]), 'SVO',
											 (c_rel_3[2][0], c_rel_2[2][0], c_rel_1[2][2]))
								if DEBUG:
									print("V_CMP	Instance found (coarse)!")
									print(f"V_CMP:		{c_rel_new[0][0]}; {c_rel_new[0][1]}; {c_rel_new[0][2]}")
									print(f"V_CMP:		{r['word']}")
								a_c.append(c_rel_new)
								c_discard_idxs.append(c_rel_idx_1)
								c_discard_idxs.append(c_rel_idx_3)
								# NOTE: the same dict is shared with the fine pass above, so a complement
								# matched in both passes is counted in both.
								if complement_token not in vcmp_dict:
									vcmp_dict[complement_token] = 1
								else:
									vcmp_dict[complement_token] += 1
								continue  # there could be multiple co-ordinated subjects
						break  # but there could be only one V_CMP structure for each CMP (since we're assuming this to be a dependency tree, not a graph).

		# HEAD_NEG amendment is disabled (flag is False) and currently has no implementation.
		if HEAD_NEG_flag is True:
			pass

		amend_fine_rels.append(a_f)
		amend_coarse_rels.append(a_c)
		# keep every original relation that was not merged into an amended one
		p_f = []  # pruned_f
		p_c = []  # pruned_c
		for f_rel_idx, f_rel in enumerate(f):
			if f_rel_idx not in f_discard_idxs:
				p_f.append(f_rel)
		for c_rel_idx, c_rel in enumerate(c):
			if c_rel_idx not in c_discard_idxs:
				p_c.append(c_rel)
		pruned_fine_rels.append(p_f)
		pruned_coarse_rels.append(p_c)

	return amend_fine_rels, amend_coarse_rels, pruned_fine_rels, pruned_coarse_rels, vcmp_dict


def main():
	"""Parse one slice of the input corpus and write the extracted relations.

	The input is a JSON-lines file whose entries carry a ``'splitted_text'`` list
	of sentences. The entries are divided into ``--total_slices`` equally sized
	slices (the last slice also takes the remainder) and only slice
	``--slice_id`` is processed. For every entry in the slice the sentences are
	parsed with DDParser, fine- and coarse-grained relations are extracted,
	reformatted, expanded (DOB -> SVO), filtered, amended and finally written as
	one JSON line to the output file. Aggregate statistics are printed and dumped
	to the stats file at the end.

	``--output_data_filename`` and ``--json_stats_filename`` must contain a
	``%d`` placeholder, which is filled with the slice id.
	"""

	parser = argparse.ArgumentParser()
	parser.add_argument('-i', '--data_entry_filename', default='./webhose_data_entries_no_corenlp.jsonl', type=str)
	parser.add_argument('-o', '--output_data_filename', default='./webhose_data_entries_with_parse_%d.json', type=str)
	parser.add_argument('-j', '--json_stats_filename', default='./rel_counts_%d.json')
	# Required: the id is %d-formatted into both output filenames and drives the
	# slicing arithmetic, so a missing value used to fail later with a TypeError.
	parser.add_argument('-s', '--slice_id', type=int, required=True)
	parser.add_argument('--total_slices', type=int, default=8)

	args = parser.parse_args()

	# initializing global registers for filtering triples with stopword/number arguments
	stop_word_list = ['有', '没有', '还有', '还', '是', '你', '我', '他', '她', '它', '他们', '她们', '它们', '带', '的', '任',
					  '这', '那', '这些', '那些', '哪', '哪些', '这个', '那个', '这里', '那里', '里', '可能']
	fine_stop_word_count_bucket = {stop_word: 0 for stop_word in stop_word_list}
	coarse_stop_word_count_bucket = {stop_word: 0 for stop_word in stop_word_list}
	fine_digit_excluded_count = 0  # the count of arguments that appear to be pure numbers in fine-grained extracted relations
	coarse_digit_excluded_count = 0  # the count of arguments that appear to be pure numbers in coarse-grained extracted relations

	vcmp_bucket = {}

	amend_fine_count = 0
	amend_coarse_count = 0

	ONLY_KEEP_SVO = True
	DEBUG = False
	MUST_INCLUDE_CHINESE_flag = True

	extract_time_sum = 0
	reformat_time_sum = 0
	translate_time_sum = 0
	filter_time_sum = 0
	amendment_time_sum = 0

	# read in the data input.
	data_entry_filename = args.data_entry_filename
	output_data_filename = args.output_data_filename % args.slice_id
	json_stats_filename = args.json_stats_filename % args.slice_id
	print(f"Working on slice number {args.slice_id}!")

	data_entries = []
	with open(data_entry_filename, 'r', encoding='utf8') as fp:
		for lidx, line in enumerate(fp):
			# progress report every 10000 lines (the counter used to never advance)
			if lidx > 0 and lidx % 10000 == 0:
				print(lidx)
			data_entries.append(json.loads(line))

	slice_standard_size = (len(data_entries) // args.total_slices)
	slice_start = args.slice_id * slice_standard_size
	# the last slice additionally takes whatever is left over by the integer division
	slice_end = (args.slice_id + 1) * slice_standard_size if (args.slice_id != args.total_slices-1) else len(data_entries)

	print(f"Total entries: {len(data_entries)}; attending to entries from {slice_start} to {slice_end}!")

	ddp = ddparser.DDParser(encoding_model='transformer')
	st = time.time()

	# Opening in 'w' truncates any previous output; every entry is flushed as soon
	# as it is written so partial results survive an interrupted run.
	with open(output_data_filename, 'w', encoding='utf8') as out_fp:
		for idx, entry in enumerate(data_entries):
			if idx < slice_start:
				continue
			if idx >= slice_end:
				print(f"Last index (not included): {idx}!")
				break
			entry_st = time.time()
			sentences = entry['splitted_text']
			ddp_res = ddp.parse(sentences)
			fine_res = []
			coarse_res = []
			for r in ddp_res:
				fine_info = FineGrainedInfo(r)
				individual_fine_res = fine_info.parse()
				# sanity check: every argument is None or an (index, token) pair
				for rel in individual_fine_res:
					for a in rel[0]:
						assert a is None or len(a) == 2
				fine_res.append(individual_fine_res)

				coarse_info = CoarseGrainedInfo(r)
				individual_coarse_res = coarse_info.parse()
				# same sanity check on the coarse output (previously re-checked the fine output)
				for rel in individual_coarse_res:
					for a in rel[0]:
						assert a is None or len(a) == 2
				coarse_res.append(individual_coarse_res)

			extracted_t = time.time()

			# reformat from (([S_id, S], [V_id, V], [O_id, O]), 'SVO') into ((S, V, O), 'SVO', (S_id, V_id, O_id))
			fine_res = reformat_cur_rels(fine_res)
			coarse_res = reformat_cur_rels(coarse_res)

			reformatted_t = time.time()

			# split n-ary relations into sets of binary relations (still keeps the original n-ary relations (DOB) in the list)
			fine_res = translate_nary_to_binaries(fine_res)
			coarse_res = translate_nary_to_binaries(coarse_res)

			translated_t = time.time()

			fine_res, fine_digit_excluded_count = filter_triples_stopwords(
				fine_res, stop_word_list, fine_stop_word_count_bucket,
				fine_digit_excluded_count, MUST_INCLUDE_CHINESE_flag)
			coarse_res, coarse_digit_excluded_count = filter_triples_stopwords(
				coarse_res, stop_word_list, coarse_stop_word_count_bucket,
				coarse_digit_excluded_count, MUST_INCLUDE_CHINESE_flag)

			filtered_t = time.time()

			amend_fine_res, amend_coarse_res, fine_res, coarse_res, cur_vcmp_dict = build_amendment_relations(ddp_res, fine_res, coarse_res, DEBUG)
			merge_dict(vcmp_bucket, cur_vcmp_dict)

			amendment_t = time.time()

			# the amended relations are filtered too, but do not contribute to the global filter statistics
			amend_fine_res, _ = filter_triples_stopwords(amend_fine_res, stop_word_list, MUST_INCLUDE_CHINESE_flag=MUST_INCLUDE_CHINESE_flag)
			amend_coarse_res, _ = filter_triples_stopwords(amend_coarse_res, stop_word_list, MUST_INCLUDE_CHINESE_flag=MUST_INCLUDE_CHINESE_flag)

			if ONLY_KEEP_SVO:
				fine_res = only_keep_svo(fine_res)
				coarse_res = only_keep_svo(coarse_res)
				amend_fine_res = only_keep_svo(amend_fine_res)
				amend_coarse_res = only_keep_svo(amend_coarse_res)

			entry['fine_rels'] = fine_res
			entry['coarse_rels'] = coarse_res
			entry['ddp_lbls'] = ddp_res
			entry['amend_fine_rels'] = amend_fine_res
			entry['amend_coarse_rels'] = amend_coarse_res

			# amend_*_res is a list of per-sentence lists: count the relations, not the sentences
			amend_fine_count += sum(len(sent_rels) for sent_rels in amend_fine_res)
			amend_coarse_count += sum(len(sent_rels) for sent_rels in amend_coarse_res)

			out_fp.write(json.dumps(entry, ensure_ascii=False) + '\n')
			out_fp.flush()

			extract_time_sum += (extracted_t-entry_st)
			reformat_time_sum += (reformatted_t-extracted_t)
			translate_time_sum += (translated_t-reformatted_t)
			filter_time_sum += (filtered_t-translated_t)
			amendment_time_sum += (amendment_t-filtered_t)

			if idx % 100 == 0 and idx > 0:
				elapsed = int(time.time() - st)
				dur_h, remainder = divmod(elapsed, 3600)
				dur_m, dur_s = divmod(remainder, 60)
				print(idx, 'time lapsed: %d hours %d minutes %d seconds' % (dur_h, dur_m, dur_s))
				print(f"extract: {extract_time_sum}; reformat: {reformat_time_sum}; trans: {translate_time_sum}; filter: {filter_time_sum}; amend: {amendment_time_sum}.")

	print("Fine grained relations bucket of filtered-out relations according to stop words: ")
	print(fine_stop_word_count_bucket)
	print("")

	print("Coarse grained relations bucket of filtered-out relations according to stop words: ")
	print(coarse_stop_word_count_bucket)
	print("")

	print(f"Number of fine grained relations filtered out because of number arguments: {fine_digit_excluded_count}")
	print(f"Number of coarse grained relations filtered out because of number arguments: {coarse_digit_excluded_count}")

	print(f"Number of fine-grained relations additionally found: {amend_fine_count}")
	print(f"Number of coarse-grained relations additionally found: {amend_coarse_count}")

	print("Bucket of complements involved in V_CMP amendment: ")
	print(vcmp_bucket)

	stats_dict = {'fine_stop_word_count_bucket': fine_stop_word_count_bucket,
				  'coarse_stop_word_count_bucket': coarse_stop_word_count_bucket,
				  'fine_digit_excluded_count': fine_digit_excluded_count,
				  'coarse_digit_excluded_count': coarse_digit_excluded_count,
				  'amend_fine_count': amend_fine_count,
				  'amend_coarse_count': amend_coarse_count,
				  'vcmp_bucket': vcmp_bucket}
	with open(json_stats_filename, 'w', encoding='utf8') as fp:
		json.dump(stats_dict, fp, ensure_ascii=False)

	print("Finished!")


def prune_parsed_file():
	"""Re-apply the stop-word / number / non-Chinese filter to an already parsed file.

	Reads a JSON-lines file of parsed documents, filters the four relation lists
	of every document (``fine_rels``, ``coarse_rels``, ``amend_fine_rels``,
	``amend_coarse_rels``) and writes the documents back as JSON lines.

	All documents are loaded before anything is written because the default input
	and output paths are the same file.
	"""
	parser = argparse.ArgumentParser()
	parser.add_argument('-i', '--parsed_entries', default='./webhose_data_entries_with_parse.json', type=str)
	parser.add_argument('-o', '--pruned_entries', default='./webhose_data_entries_with_parse.json', type=str)

	args = parser.parse_args()

	stop_word_list = ['有', '没有', '还有', '还', '是', '你', '我', '他', '她', '它', '他们', '她们', '它们', '带', '的', '任',
					  '这', '那', '这些', '那些', '哪', '哪些', '这个', '那个', '这里', '那里', '里', '可能']
	relation_keys = ('fine_rels', 'coarse_rels', 'amend_fine_rels', 'amend_coarse_rels')

	data_entries = []
	with open(args.parsed_entries, 'r', encoding='utf8') as fp:
		for lidx, line in enumerate(fp):
			if lidx % 1000 == 0 and lidx > 0:
				print(lidx)
			doc = json.loads(line)
			for key in relation_keys:
				# filter_triples_stopwords returns (filtered_rels, count); only the
				# relations belong in the document (the whole tuple used to be stored).
				doc[key], _ = filter_triples_stopwords(doc[key], stop_word_list,
														MUST_INCLUDE_CHINESE_flag=True)
			data_entries.append(doc)

	print("Dumping......")
	with open(args.pruned_entries, 'w', encoding='utf8') as fp:
		for doc in data_entries:
			fp.write(json.dumps(doc, ensure_ascii=False)+'\n')


# Script entry point: only runs when the file is executed directly, not on import.
if __name__ == '__main__':
	# default action: parse one slice of the corpus and extract its relations
	main()
	# alternative action (disabled): re-filter an already parsed file in place
	# prune_parsed_file()
