import re
import argparse
# from nltk.tokenize import sent_tokenize
from pysbd import Segmenter
from sacremoses import MosesTokenizer
from multiprocessing import Pool

# nltk_langtag = {
#     'de': 'german',
#     'es': 'spanish',
#     'en': 'english'
# }

# def splitChineseSentence(sentence):
#     resentencesp = re.compile('([﹒﹔﹖﹗．；。！？]["’”」』]{0,2}|：(?=["‘“「『]{1,2}|$))')
#     s = sentence
#     slist = []
#     for i in resentencesp.split(s):
#         if resentencesp.match(i) and slist:
#             slist[-1] += i
#         elif i:
#             slist.append(i)
#     return slist

# def sentSplitFn(paragraph: str, lang='en', return_str=False, delimitor=None):
#     if delimitor is not None:
#         sents = paragraph.split(delimitor)
#     else:
#         if lang == 'zh':
#             sents = splitChineseSentence(paragraph)
#         else:
#             langtag = nltk_langtag[lang]
#             sents = sent_tokenize(paragraph, langtag)
#     return " ".join(sents) if return_str else sents

def readTxt(fname):
    """Load a UTF-8 text file as a list of whitespace-stripped lines.

    The file is opened in binary mode and every line is decoded as UTF-8
    before leading/trailing whitespace (including the newline) is removed.
    Blank lines are kept as empty strings.

    Args:
        fname: Path of the file to read.

    Returns:
        A list with one stripped string per line of the file.
    """
    with open(fname, 'rb') as stream:
        lines = [raw.decode('utf-8').strip() for raw in stream]
    # Report how many lines were loaded.
    print("Reading {} example from {}".format(len(lines), fname))
    return lines

def saveTxt(data, fname):
    """Write each item of ``data`` to ``fname``, one item per line.

    The file is written as UTF-8 so that it round-trips with ``readTxt``
    (which decodes UTF-8) regardless of the platform's default encoding.

    Args:
        data: Sized iterable of items; each is formatted with ``str.format``.
        fname: Path of the output file (overwritten if it exists).
    """
    # Explicit encoding: the locale default may not be able to encode
    # non-ASCII text (e.g. Chinese) on some platforms.
    with open(fname, 'w', encoding='utf-8') as fout:
        for d in data:
            fout.write('{}\n'.format(d))
    print('Save {} example to {}'.format(len(data), fname))

def sentSplitFn(paragraph: str, lang='en', return_str=False, delimitor=None):
    """Split a paragraph into sentences.

    Args:
        paragraph: Text to split.
        lang: Language code handed to the pysbd ``Segmenter``; only used
            when ``delimitor`` is None.
        return_str: When true, return the pieces joined by a single space
            instead of a list.
        delimitor: If given, the paragraph is split on this exact string
            and the segmenter is not used.

    Returns:
        The list of pieces, or a space-joined string when ``return_str``
        is true.
    """
    if delimitor is None:
        # No explicit delimiter: fall back to rule-based segmentation.
        pieces = Segmenter(language=lang).segment(paragraph)
    else:
        pieces = paragraph.split(delimitor)

    if return_str:
        return " ".join(pieces)
    return pieces

def mergeSent(args):
    """Sentence-split every input line and re-join it with the delimiter.

    Reads ``args.i``, splits each line into sentences with ``sentSplitFn``
    (language ``args.l``) across a pool of ``args.t`` worker processes,
    joins each line's sentences with ``args.d`` and writes the result to
    ``args.o``, one line per input line.

    Args:
        args: Parsed command-line namespace providing ``i``, ``o``, ``l``,
            ``d`` and ``t``.
    """
    paragraphs = readTxt(args.i)
    joiner = args.d
    # One argument tuple per paragraph: (text, language, return_str=False).
    jobs = [(paragraph, args.l, False) for paragraph in paragraphs]
    with Pool(args.t) as pool:
        split_paragraphs = pool.starmap(sentSplitFn, jobs)
    merged = [joiner.join(sentences) for sentences in split_paragraphs]
    saveTxt(merged, args.o)

def mergeDoc(args):
    """Collapse blank-line-separated paragraphs into one line per document.

    Reads ``args.i``; consecutive non-empty lines are treated as the
    paragraphs of one document and joined with ``args.d``. Every empty
    line closes the current document (so consecutive empty lines yield
    empty documents). The documents are written to ``args.o``.

    Args:
        args: Parsed command-line namespace providing ``i``, ``o`` and ``d``.
    """
    lines = readTxt(args.i)
    joiner = args.d
    merged = []
    current = []
    for raw in lines:
        text = raw.strip()
        if text:
            current.append(text)
            continue
        # Empty line: flush whatever has been collected so far.
        merged.append(joiner.join(current))
        current = []

    # Flush a trailing document that was not followed by an empty line.
    if current:
        merged.append(joiner.join(current))

    saveTxt(merged, args.o)

if __name__ == "__main__":
    # Command-line entry point: parse the options and run the selected mode.
    # Modes are resolved through an explicit table rather than eval(), so
    # arbitrary text passed to -m can never be executed as code.
    modes = {
        'mergeSent': mergeSent,
        'mergeDoc': mergeDoc,
    }

    parser = argparse.ArgumentParser()
    parser.add_argument('-l', help='lang', type=str, default='zh')
    parser.add_argument('-i', help='input', type=str, required=True)
    parser.add_argument('-o', help='output', type=str, required=True)
    parser.add_argument('-d', help='sentence delimitor', type=str, default='<q>')
    parser.add_argument('-m', help='mode', type=str, required=True,
                        choices=sorted(modes))
    parser.add_argument('-t', help='number of process', type=int, default=40)
    args = parser.parse_args()

    modes[args.m](args)
