import argparse
import contextlib
import copy
import json
import os
import re
import time

from StanfordCoreNLP import *

# Folder containing the raw article files; every directory entry is queued for processing.
root_dir = '/Users/teddy/Files/Potential Corpus/WebHoses_Chinese_News_Articles/630_webhose-2016-10_20170904084325'
filename_list = os.listdir(root_dir)

# A sentence is kept only when it contains more than FILTER_LEN CJK characters.
FILTER_LEN = 4

# Strings on which text is cut into candidate sentences, applied one after another.
delimiters = [
	'\n', '。', '！', '？', '；', '：',
	':', ' ', '……', ';', '（', '）',
]

# ASCII lowercase letters, digits and uppercase letters, one character per element.
english_or_digits = list(
	'abcdefghijklmnopqrstuvwxyz'
	'0123456789'
	'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
)

# Entries that yielded at least one sentence, and entries that yielded none.
data_entries = []
excluded_entries = []

# Running statistics, updated while the corpus is processed.
global_total_sentence_count = 0
global_filtered_sentence_count = 0
global_filtered_entry_count = 0
global_outlengthed_count = 0
global_longest_sentence_len = 0


def split_long_sentences(string, interval):
	global global_filtered_sentence_count
	res_list = []
	offset = 0
	while offset < len(string):
		if offset+interval >= len(string):
			res_list.append(string[offset:])
			offset += interval
			break
		closest = -1
		for i in range(interval, 0, -1):
			if offset+i >= len(string):
				continue
			c = string[offset+i]
			if c in ['：', '，', '；', '.', '、']:
				closest = offset+i
				res_list.append(string[offset:closest])
				offset = closest+1
				break
		if closest == -1 and offset+interval < len(string):
			closest = offset+interval
			res_list.append(string[offset:closest])
			offset = closest

	final_res_list = []
	for item in res_list:
		chinese_character_list = re.findall(r'[\u4e00-\u9fff]+', item)
		chinese_character_list = ''.join(chinese_character_list)
		if len(chinese_character_list) > FILTER_LEN:
			final_res_list.append(item)
		else:
			global_filtered_sentence_count += 1
	return final_res_list


def split_str(splitted):
	"""Split text fragments into length-filtered, stripped sentences.

	Each fragment is cut on every entry of the module-level ``delimiters``,
	one delimiter after another. Each resulting candidate is then classified
	by its number of CJK characters (U+4E00-U+9FFF) and its total length:

	* more than 430 CJK characters or at least 500 characters overall: the
	  candidate is passed to ``split_long_sentences`` with an interval of 430
	  and ``global_outlengthed_count`` is incremented;
	* otherwise, more than ``FILTER_LEN`` CJK characters: kept as is;
	* otherwise: dropped, and ``global_filtered_sentence_count`` is
	  incremented.

	The surviving sentences are stripped of surrounding whitespace.
	``global_longest_sentence_len`` and ``global_total_sentence_count`` are
	updated from the stripped result.

	Args:
		splitted: List of text fragments (strings) to split further.

	Returns:
		A new list with the kept sentences, in order.
	"""
	global global_filtered_sentence_count
	global global_longest_sentence_len
	global global_outlengthed_count
	global global_total_sentence_count

	max_chinese_len = 430  # also the interval used to re-split long sentences
	max_total_len = 500

	# Apply the delimiters successively. Strings are immutable, so building a
	# fresh list per delimiter is enough; no deep copy is required.
	fragments = splitted
	for d in delimiters:
		fragments = [piece for s in fragments for piece in s.split(d)]

	kept = []
	for s in fragments:
		chinese_len = sum(len(run) for run in re.findall(r'[\u4e00-\u9fff]+', s))
		if chinese_len > max_chinese_len or len(s) >= max_total_len:
			print("Accounting for overlengthed sentences!")
			further_splitted = split_long_sentences(s, max_chinese_len)
			global_outlengthed_count += 1
			kept += further_splitted
		elif chinese_len > FILTER_LEN:
			kept.append(s)
		else:
			global_filtered_sentence_count += 1

	sentences = [s.strip() for s in kept]

	global_longest_sentence_len = max(
		global_longest_sentence_len,
		max(map(len, sentences), default=0),
	)
	global_total_sentence_count += len(sentences)
	return sentences


def _write_jsonl(path, entries):
	"""Write every entry to ``path`` as one JSON document per line (UTF-8, non-ASCII kept as is)."""
	with open(path, 'w', encoding='utf8') as fp:
		for entry in entries:
			fp.write(json.dumps(entry, ensure_ascii=False) + '\n')


# ---- Command-line options -------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument('--output', type=str, default='.//webhose_data_entries_no_corenlp.jsonl')
parser.add_argument('--corenlp', type=int, default=0, help='whether or not to use corenlp for sentence splitting.')
parser.add_argument('--excluded', type=str, default='webhose_data_entries_no_corenlp_excluded.jsonl')

args = parser.parse_args()

# ---- Process every file in root_dir ---------------------------------------
StanfordCoreNLP_chinese_properties = get_StanfordCoreNLP_chinese_properties()
with contextlib.ExitStack() as stack:
	# The client is only used when --corenlp > 0, so its context is only
	# entered in that case.
	# NOTE(review): for a Stanza-style CoreNLPClient, entering the context is
	# what launches the Java server - confirm for the local StanfordCoreNLP
	# wrapper.
	client = None
	if args.corenlp > 0:
		client = stack.enter_context(CoreNLPClient(
			annotators=['tokenize', 'ssplit'],
			properties=StanfordCoreNLP_chinese_properties,
			threads=16,
			memory='24G',
			be_quiet=True))

	st = time.time()
	for idx, filename in enumerate(filename_list):
		# Progress report every 1000 files.
		if idx % 1000 == 0 and idx > 0:
			elapsed = int(time.time() - st)
			dur_h, remainder = divmod(elapsed, 3600)
			dur_m, dur_s = divmod(remainder, 60)
			print(idx, 'time lapsed: %d hours %d minutes %d seconds' % (dur_h, dur_m, dur_s))
			print("Current global_longest_sentence_len: ", global_longest_sentence_len)

		with open(os.path.join(root_dir, filename), 'r', encoding='utf8') as fp:
			data_entry = json.load(fp)
		text = data_entry['text']

		if args.corenlp > 0:
			# Rebuild each CoreNLP sentence from its tokens, re-inserting one
			# space per character of gap between consecutive token offsets.
			splitted = []
			ann = client.annotate(text)
			for sentence in ann.sentence:
				parts = []
				prev_end = None
				for token in sentence.token:
					if prev_end is not None and prev_end != token.beginChar:
						num_spaces = token.beginChar - prev_end
						assert num_spaces > 0
						parts.append(' ' * num_spaces)
					parts.append(token.word)
					prev_end = token.endChar
				splitted.append(''.join(parts))
		else:
			splitted = [text]

		splitted_text = split_str(splitted)
		if len(splitted_text) == 0:
			# Nothing survived the length filter: record the entry as excluded.
			global_filtered_entry_count += 1
			excluded_entries.append(data_entry)
			continue
		data_entry['splitted_text'] = splitted_text
		data_entries.append(data_entry)

# ---- Summary statistics ---------------------------------------------------
print("Total number of sentences filtered out via length criterion: ", global_filtered_sentence_count)
print("Total number of entries filtered out due to no lengthly enough sentences: ", global_filtered_entry_count)
print("Total number of entries exceeding length limit and have to be splitted in finer granularity: ", global_outlengthed_count)
print("global_longest_sentence_len: ", global_longest_sentence_len)
print("global_total_sentence_count: ", global_total_sentence_count)

# ---- Persist results ------------------------------------------------------
_write_jsonl(args.output, data_entries)
_write_jsonl(args.excluded, excluded_entries)

print("Saved!")
