import re
import argparse
import numpy as np
from nltk.tokenize import sent_tokenize
from rouge import Rouge

def sentSplitFn(s: str):
    """Split ``s`` into sentence fragments on ``.``, ``?`` and ``!``.

    The terminator characters themselves are dropped. Fragments that are
    empty or contain only whitespace are discarded, so text ending in a
    terminator (``"A. B."``) or containing consecutive terminators
    (``"What?!"``) does not produce phantom sentences.

    Args:
        s: Text to split.

    Returns:
        A list of the non-blank fragments, in order, with their original
        surrounding whitespace preserved.
    """
    # NOTE(review): only ASCII terminators are matched; full-width
    # punctuation is not treated as a sentence boundary.
    pattern = r"[.?!]"
    # re.split emits '' after a trailing terminator and between adjacent
    # terminators; those pieces are not sentences, so filter them out.
    return [piece for piece in re.split(pattern, s) if piece.strip()]

def readTxt(fname):
    """Load a UTF-8 encoded text file as a list of stripped lines.

    Args:
        fname: Path of the file to read.

    Returns:
        One string per line of the file, with leading and trailing
        whitespace removed. Blank lines are kept as empty strings.
    """
    # The file is read as bytes and decoded per line, so line boundaries
    # come from binary-mode iteration rather than text-mode newline handling.
    with open(fname, 'rb') as handle:
        lines = [raw.decode('utf-8').strip() for raw in handle]
    print("Reading {} example from {}".format(len(lines), fname))
    return lines

def saveTxt(data, fname):
    """Write each item of ``data`` to ``fname``, one per line, as UTF-8.

    Args:
        data: Sized collection of items; each is formatted with ``str.format``
            and followed by a newline.
        fname: Path of the file to create or overwrite.
    """
    # Encode explicitly as UTF-8 so the output round-trips through readTxt
    # (which decodes UTF-8) regardless of the platform's default encoding.
    with open(fname, 'w', encoding='utf-8') as fout:
        fout.writelines('{}\n'.format(d) for d in data)
    print('Save {} example to {}'.format(len(data), fname))

def checkSentNum(args):
    """Print the average number of sentences per line of the input file.

    Each line of ``args.i`` is split with ``sentSplitFn``; only fragments
    containing non-whitespace text are counted as sentences.

    Args:
        args: Parsed command-line namespace; ``args.i`` is the input path.
    """
    datas = readTxt(args.i)
    if not datas:
        # An average over zero lines is undefined; report it instead of
        # failing with ZeroDivisionError.
        print("no lines in {}; cannot compute average number of sents".format(args.i))
        return

    sent_total = 0
    for data in datas:
        # Splitting on terminators can leave empty pieces (e.g. after a
        # final period); those must not inflate the sentence count.
        sent_total += sum(1 for sent in sentSplitFn(data) if sent.strip())

    print("average number of sents: {:.2f}".format(sent_total / len(datas)))


def checkLengthRatio(args):
    """Print length statistics comparing a source file with a target file.

    Lengths are whitespace-token counts per line. Lines are paired by
    position; the ratio for a pair is ``target_length / source_length``.

    Args:
        args: Parsed command-line namespace; ``args.s`` is the source path
            and ``args.t`` is the target path.
    """
    sources = readTxt(args.s)
    targets = readTxt(args.t)

    if len(sources) != len(targets):
        # zip() below stops at the shorter file, so surface the mismatch
        # rather than silently ignoring the extra lines.
        print("warning: {} source lines vs {} target lines; extra lines are ignored for ratios".format(
            len(sources), len(targets)))

    src_lengths = [len(src.split()) for src in sources]
    tgt_lengths = [len(tgt.split()) for tgt in targets]

    pairs = list(zip(src_lengths, tgt_lengths))
    # A ratio is undefined for an empty source line; skip those pairs
    # instead of raising ZeroDivisionError.
    ratios = [t / s for (s, t) in pairs if s > 0]
    skipped = len(pairs) - len(ratios)
    if skipped:
        print("warning: skipped {} pair(s) with an empty source line".format(skipped))

    if ratios:
        print("average ratio: {:.3f}".format(np.average(ratios)))
        print("the standard deviation of ratio: {:.3f}".format(np.std(ratios)))
    else:
        print("no line pairs with a non-empty source; ratio statistics skipped")

    if src_lengths:
        print("average source lengths: {:.3f}".format(np.average(src_lengths)))
        # Report the deviation of the *source* lengths, matching the label.
        print("the standard deviation of source: {:.3f}".format(np.std(src_lengths)))

if __name__ == "__main__":
    # Command-line entry point: parse the options, then run the analysis
    # function selected by -m.

    # Explicit dispatch table instead of eval(): the -m value comes from the
    # command line and must only ever select one of these functions.
    modes = {fn.__name__: fn for fn in (checkSentNum, checkLengthRatio)}

    parser = argparse.ArgumentParser()
    # -l is accepted for compatibility; neither mode reads it.
    parser.add_argument('-l', help='lang', type=str, default='zh')
    parser.add_argument('-i', help='input', type=str, default='input.txt')
    parser.add_argument('-s', help='source file', type=str, default='input.txt')
    parser.add_argument('-t', help='target file', type=str, default='input.txt')
    # parser.add_argument('-d', help="sentence delimitor", type=str, default="<q>")
    # required/choices make a missing or unknown mode a usage error rather
    # than a TypeError/NameError raised from evaluated code.
    parser.add_argument('-m', help='mode', type=str, required=True,
                        choices=sorted(modes))
    args = parser.parse_args()

    modes[args.m](args)