

def _read_discosuite(path):
    """Read the discosuite CSV at *path* and return its rows sorted by id.

    The first line is treated as a header and discarded.  Every other
    non-blank line is split on each comma (quoting is not interpreted, so
    fields must not contain commas) and column 1 is converted to ``int``.

    Returns:
        A list of rows (lists of fields) ordered by the integer in column 1;
        rows sharing an id keep their original file order (stable sort).

    Raises:
        IndexError: if a non-blank row has fewer than two columns.
        ValueError: if column 1 of a row is not an integer.
    """
    rows = []
    with open(path, encoding="utf8") as f:
        f.readline()  # skip the header line
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline at the end of the
                # file) carry no data; indexing into them would crash.
                continue
            data = line.split(",")
            data[1] = int(data[1])
            rows.append(data)
    rows.sort(key=lambda row: row[1])
    return rows


def _write_discosuite(path, rows, ids):
    """Write *rows* to *path* with a new header and renumbered sentence ids.

    The ids in *ids* are mapped to consecutive integers 1..len(ids) following
    their sorted order.  Each output row is the renumbered id followed by
    columns 0, 2, 3, 4 and 5 of the input row; any further columns are
    dropped.

    Raises:
        IndexError: if a row has fewer than six columns.
    """
    renumbered = {iid: j for j, iid in enumerate(sorted(ids), start=1)}
    with open(path, "w", encoding="utf8") as f:
        f.write("sentence_id,type,yield,cat,other cat,note\n")
        for row in rows:
            f.write(
                f"{renumbered[row[1]]},{row[0]},{row[2]},{row[3]},{row[4]},{row[5]}\n"
            )


def _split_sentences(source_path, train_path, gen_path, ids):
    """Split the lines of *source_path* between *gen_path* and *train_path*.

    Lines are numbered from 1.  A line whose number is in *ids* is written to
    *gen_path*; every other line is written to *train_path*.  Lines are
    copied unchanged, including their line endings.
    """
    # Read everything before opening the outputs so that an output path which
    # coincides with the input is not truncated before it has been read.
    with open(source_path, encoding="utf8") as f:
        sentences = f.readlines()

    with open(train_path, "w", encoding="utf8") as train, \
            open(gen_path, "w", encoding="utf8") as gen:
        for number, sentence in enumerate(sentences, start=1):
            target = gen if number in ids else train
            target.write(sentence)


def main(argv=None):
    """Rewrite a discosuite CSV and split a sentence file by its sentence ids.

    Positional command-line arguments, in order:

    * ``discosuite``  -- input CSV; column 1 holds an integer sentence id.
    * ``tigertrain``  -- input file with one sentence per line.
    * ``outputtiger`` -- output: lines of ``tigertrain`` whose 1-based line
      number is NOT a sentence id of the discosuite.
    * ``outputgen``   -- output: lines of ``tigertrain`` whose 1-based line
      number IS a sentence id of the discosuite.
    * ``outputsuite`` -- output: the discosuite sorted by sentence id, with a
      new header and ids renumbered to 1..N.

    Two summary lines (row count and number of distinct sentence ids) are
    printed to standard output.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.
    """
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("discosuite")
    parser.add_argument("tigertrain")
    parser.add_argument("outputtiger")
    parser.add_argument("outputgen")
    parser.add_argument("outputsuite")
    args = parser.parse_args(argv)

    discosuite = _read_discosuite(args.discosuite)
    print(f"Len discosuite {len(discosuite)}")
    ids = {row[1] for row in discosuite}
    print(f"Number sentences {len(ids)}")

    # Rewrite discosuite with alternative header + sorted by sentence id
    _write_discosuite(args.outputsuite, discosuite, ids)

    _split_sentences(args.tigertrain, args.outputtiger, args.outputgen, ids)

# Run the command-line entry point only when this file is executed as a
# script, so that importing the module does not parse arguments or touch files.
if __name__ == "__main__":
    main()
