import random
import argparse
from ioFn import readTxt, saveTxt

def splitSubset(datas: list, d: int):
    """Randomly partition ``datas`` into a train subset and a dev subset.

    ``d`` distinct positions are drawn with ``random.sample`` (so the split
    is reproducible after ``random.seed``). Items at those positions go to
    the dev subset, all remaining items go to the train subset. Both subsets
    keep the relative order the items had in ``datas``.

    Args:
        datas: Items to split.
        d: Number of items to place in the dev subset.

    Returns:
        A ``(train_subsets, dev_subsets)`` tuple of lists.

    Raises:
        ValueError: If ``d`` is negative or larger than ``len(datas)``
            (raised by ``random.sample``).
    """
    # A set makes each membership test O(1); testing against the sampled
    # list directly would cost O(d) per item, i.e. O(len(datas) * d) overall.
    dev_indexs = set(random.sample(range(len(datas)), d))
    train_subsets = []
    dev_subsets = []
    for i, data in enumerate(datas):
        if i in dev_indexs:
            dev_subsets.append(data)
        else:
            train_subsets.append(data)
    return train_subsets, dev_subsets

def main():
    """Command-line entry point: split an input file into train/dev files.

    Parses ``-i`` (input file), ``-d`` (dev size) and ``--seed``, seeds the
    ``random`` module, reads the input with ``readTxt``, splits it with
    ``splitSubset`` and saves the two parts with ``saveTxt`` to
    ``<input>.train`` and ``<input>.dev``.
    """
    parser = argparse.ArgumentParser()

    # Both options are mandatory: without them the script would fail later
    # with an opaque TypeError (None path / None sample size), so let
    # argparse report a proper usage error instead.
    parser.add_argument(
        "-i", help="input file", type=str, required=True
    )
    parser.add_argument(
        "-d", help="dev size", type=int, required=True
    )

    parser.add_argument(
        "--seed", help="random seed", type=int, default=16
    )

    args = parser.parse_args()

    # Seed before sampling so the same seed always yields the same split.
    random.seed(args.seed)
    datas = readTxt(args.i)
    train, dev = splitSubset(datas, args.d)
    trainfile = args.i + ".train"
    devfile = args.i + ".dev"

    saveTxt(train, trainfile)
    saveTxt(dev, devfile)

# Run the split only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

    # Example invocation:
    # python3 splitTrainDev.py -i /mnt/bd/lab-wxz/clt/cc100/en.first500k.txt -d 5000
