# -*- coding: utf-8 -*-
import sys
import pandas as pd
import re


def read_dataset(textbook):
    """Collect cleaned sentences for the selected levels of a textbook table.

    Args:
        textbook: Object indexable by column name (for example the DataFrame
            returned by ``pd.read_excel``) that provides parallel
            ``'sentence'`` and ``'level'`` columns.

    Returns:
        list: The cleaned sentences, in input order, whose length after
        cleaning is greater than five characters.

    Rows are kept only when both cells are strings and the level contains
    '零' or '初' (presumably the zero-start / elementary levels -- confirm
    against the spreadsheet).
    """
    # A leading label of one to three characters followed by an ASCII or
    # full-width colon (e.g. a tag such as "A：") is removed from each sentence.
    leading_label = re.compile(r"^[\w\W]{1,3}[:：]")

    full_dataset = []
    for sent, level in zip(textbook['sentence'], textbook['level']):
        # Empty cells are not strings (e.g. NaN floats); a non-string level
        # would make the ``in`` checks below raise TypeError, so skip the row.
        if not isinstance(sent, str) or not isinstance(level, str):
            continue
        if '零' not in level and '初' not in level:
            continue
        sent = sent.strip()
        # Balance curly quotes: drop a dangling closing quote, or append the
        # closing quote that an opening quote is missing.
        if sent.endswith('”') and '“' not in sent:
            sent = sent[:-1]
        if '“' in sent and '”' not in sent:
            sent += '”'
        sent = leading_label.sub("", sent)
        if len(sent) > 5:
            full_dataset.append(sent)
    return full_dataset


def write2file(file_name, split_set):
    """Write every item of *split_set* to *file_name*, one item per line.

    Args:
        file_name: Path of the file to create or overwrite.
        split_set: Iterable of items; each is formatted with ``str()`` and
            followed by a newline.
    """
    # The sentences contain non-ASCII text, so pin the encoding instead of
    # relying on the platform default, which may be unable to encode them.
    with open(file_name, 'w', encoding='utf-8') as fw:
        fw.writelines(f"{line}\n" for line in split_set)

def main(*argv):
    """Split the filtered textbook sentences into train/valid/test files.

    Reads the spreadsheet at ``input_file``, filters it with ``read_dataset``
    and writes ``train.txt``, ``valid.txt`` and ``test.txt`` into
    ``output_dir``. The last 5000 sentences form the test set, the 5000
    before them the validation set, and everything earlier the training set.

    Args:
        *argv: ``(input_file, output_dir)``. When called without arguments,
            ``sys.argv[1:]`` is used instead.

    Returns:
        int: Always 0.

    Raises:
        AssertionError: If the argument count is not exactly two.
    """
    if not argv:
        argv = sys.argv[1:]
    # Raised explicitly (rather than via ``assert``) so the check survives
    # ``python -O``; the exception type is kept for existing callers.
    if len(argv) != 2:
        raise AssertionError("expected 2 arguments: <input_file> <output_dir>")
    input_file, output_dir = argv

    textbook = pd.read_excel(input_file)
    full_dataset = read_dataset(textbook)
    total_len = len(full_dataset)
    valid_num = 5000
    # Clamp the split points at 0. With fewer than 2 * valid_num sentences a
    # negative train size would be read as a from-the-end index and make the
    # three sets overlap. The test set is filled first, then validation.
    test_start = max(total_len - valid_num, 0)
    train_num = max(test_start - valid_num, 0)
    train_set = full_dataset[:train_num]
    valid_set = full_dataset[train_num:test_start]
    test_set = full_dataset[test_start:]

    write2file(f"{output_dir}/train.txt", train_set)
    write2file(f"{output_dir}/valid.txt", valid_set)
    write2file(f"{output_dir}/test.txt", test_set)
    return 0


# Script entry point: run main() and use its return value as the exit status.
if __name__ == '__main__':
    raise SystemExit(main())
