import argparse
import pyarrow

# Command-line interface: both paths are mandatory.
parser = argparse.ArgumentParser()
parser.add_argument("--input_path", type=str, required=True, help="Path to input dataset in model's format")
parser.add_argument("--output_path", type=str, required=True, help="Path to output file")

# Punctuation label vocabulary.
# NOTE(review): this list is not referenced by the code visible here, and its
# entries do not carry the "_PUNC_" prefix that the tokenizer below looks for.
# Confirm which spelling the dataset actually uses.
label_tokens = [
    "?QUESTIONMARK",
    ".PERIOD",
    ",COMMA",
    "!EXCLAMATIONMARK",
    "-DASH",
    ":COLON",
    ";SEMICOLON",
]


def split_tokens_and_labels(tokens, punct_prefix="_PUNC_"):
    """Separate word tokens from punctuation markers.

    Every token that does not start with ``punct_prefix`` is kept as an input
    token and initially labelled ``"O"``. A token that does start with
    ``punct_prefix`` is not kept as an input token; instead it becomes the
    label of the word token immediately before it. If several markers follow
    the same word, the last one wins. A marker with no preceding word (e.g.
    at the very start of the sequence) is ignored.

    Args:
        tokens: Iterable of string tokens.
        punct_prefix: Prefix that identifies a punctuation marker token.

    Returns:
        A ``(input_tokens, labels)`` tuple of two equally long lists.
    """
    input_tokens = []
    labels = []

    for token in tokens:
        if token.startswith(punct_prefix):
            # Assign (the original used `==`, a no-op comparison). Guard the
            # empty case so a leading marker cannot raise IndexError.
            if labels:
                labels[-1] = token
        else:
            input_tokens.append(token)
            labels.append("O")

    return input_tokens, labels


if __name__ == "__main__":
    args = parser.parse_args()

    # Explicit encoding so results do not depend on the platform locale.
    with open(args.input_path, 'r', encoding="utf-8") as f:
        text = f.read()

    all_tokens = text.split()
    input_tokens, labels = split_tokens_and_labels(all_tokens)

    # NOTE(review): only the word tokens are written; `labels` is computed but
    # not persisted by the code visible here.
    with open(args.output_path, 'w', encoding="utf-8") as f:
        f.write(" ".join(input_tokens))