import argparse
import json
import re
import os

from utils.tokenizer import SBDSplitor

def readJson(name):
    """Load a JSON Lines file into a list of decoded objects.

    Args:
        name: Path of a UTF-8 text file holding one JSON document per line.

    Returns:
        A list with one decoded value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    with open(name, encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. an empty line at the end of the file) hold no
            # record; json.loads would raise on them, so skip them.
            if not line.strip():
                continue
            data.append(json.loads(line))
    print('Load File %s with %d examples' % (name, len(data)))
    return data


def saveTxt(data, name):
    """Write the 'text' and 'summary' fields of each record to parallel files.

    Two files are created (or overwritten): ``name + '.doc'`` receives one
    ``d['text']`` per line and ``name + '.sum'`` receives one ``d['summary']``
    per line, in the same order as *data*.

    Args:
        data: Iterable of mappings, each with string values under the keys
            'text' and 'summary'.
        name: Output path prefix; the '.doc' and '.sum' suffixes are appended.

    Raises:
        KeyError: If a record lacks the 'text' or 'summary' key.
    """
    # Write UTF-8 explicitly so the output does not depend on the platform's
    # default encoding (readJson reads its input as UTF-8).
    with open(name + '.doc', 'w', encoding="utf-8") as f1, \
            open(name + '.sum', 'w', encoding="utf-8") as f2:
        for d in data:
            f1.write(d['text'] + '\n')
            f2.write(d['summary'] + '\n')
    print('Saving into File %s' % (name))

def addSplitTxt(inputfile, outputfile, segment):
    """Rewrite a text file with each line's sentences joined by ``<q>``.

    Every line of *inputfile* is stripped of surrounding whitespace, passed to
    ``segment.split`` and the returned pieces are joined with the literal
    marker ``<q>``. One output line is written per input line.

    Args:
        inputfile: Path of the text file to read, one example per line.
        outputfile: Path of the file to create or overwrite.
        segment: Object exposing ``split(text)`` that returns an iterable of
            strings for a line of text.
    """
    cnt = 0
    # Read and write UTF-8 explicitly so non-ASCII text is handled the same
    # way regardless of the platform's default encoding.
    with open(inputfile, encoding="utf-8") as fin, \
            open(outputfile, 'w', encoding="utf-8") as fout:
        for line in fin:
            sents = segment.split(line.strip())
            fout.write("<q>".join(sents) + '\n')
            cnt += 1
    print("Convert {} examples for {} to {}".format(cnt, inputfile, outputfile))


if __name__ == "__main__":
    # Command-line entry point: for every split/type pair, read
    # <input dir>/{split}.{lang}.{type} and write the <q>-joined version to
    # the same file name under the output directory.
    parser = argparse.ArgumentParser()
    # Both -i and -o are used as directories below (via os.path.join), so the
    # help text describes them as such; the default values are unchanged.
    parser.add_argument('-i', type=str, default="data.txt", help='input directory')
    parser.add_argument('-o', type=str, default="cleardaata.txt", help='output directory')
    parser.add_argument('-l', type=str, default="en", help='language')
    args = parser.parse_args()

    # makedirs with exist_ok avoids the check-then-create race and also
    # creates missing parent directories.
    os.makedirs(args.o, exist_ok=True)

    # Earlier preprocessing step, kept disabled: converts the raw JSONL splits
    # into the .doc/.sum text files.
    # for split in ['test', 'val', 'train']:
    #     data = readJson(os.path.join(args.i, "{}_{}.jsonl".format(args.l, split)))
    #     saveTxt(data, os.path.join(args.o, "{split}.{lg}".format(split=split, lg=args.l)))

    sbd = SBDSplitor(args.l)
    for split in ['test', 'dev', 'train']:
        for t in ['doc', 'sum']:
            # Input and output share the same file name; only the directory differs.
            filename = "{split}.{lg}.{type}".format(split=split, lg=args.l, type=t)
            inputfile = os.path.join(args.i, filename)
            outputfile = os.path.join(args.o, filename)
            addSplitTxt(inputfile, outputfile, sbd)
    
        