import argparse
from transformers import BertTokenizer


def main(args):
    """Copy the utterances that fit within a BERT word-piece budget.

    Each non-blank line of ``args.input`` must hold exactly three
    tab-separated fields: domain, intent, and a space-separated sequence of
    ``token|slot`` items. The tokens (the text before the first ``|`` of each
    item) are joined with spaces and tokenized with the BERT tokenizer loaded
    from ``args.bert_model_path``. Lines whose tokenization has at most
    ``max_length`` word pieces are written (stripped of surrounding
    whitespace) to ``args.output``; the rest are dropped. A summary of how
    many lines were removed is printed at the end.

    Args:
        args: Namespace providing ``input``, ``output`` and
            ``bert_model_path``. An optional ``max_length`` attribute
            overrides the default limit of 40 word pieces.

    Raises:
        ValueError: If input and output resolve to the same file, or if a
            non-blank line does not have exactly three tab-separated fields.
    """
    import os

    # Callers that do not set ``max_length`` keep the historical limit.
    max_length = getattr(args, 'max_length', 40)

    # Opening the output in write mode truncates it immediately, which would
    # wipe the input before it is read if both names point at the same file.
    if os.path.realpath(args.input) == os.path.realpath(args.output):
        raise ValueError(
            "Input and output must be different files: {}".format(args.input))

    tokenizer = BertTokenizer.from_pretrained(args.bert_model_path)
    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(args.input, 'rt', encoding='utf-8') as fpr, \
            open(args.output, 'wt', encoding='utf-8') as fwp:
        count = 0
        remove_count = 0
        for line_number, line in enumerate(fpr, start=1):
            line = line.strip()
            if not line:
                continue
            count += 1
            fields = line.split('\t')
            if len(fields) != 3:
                raise ValueError(
                    "Line {}: expected 3 tab-separated fields, got {}".format(
                        line_number, len(fields)))
            tokens_slots = fields[2]
            text = ' '.join(
                token_slot.split('|')[0] for token_slot in tokens_slots.split())
            # Converting tokens to ids maps each word piece to one id, so the
            # word-piece count alone decides whether the line is kept.
            if len(tokenizer.tokenize(text)) <= max_length:
                fwp.write(line + '\n')
            else:
                remove_count += 1
        print("{} lines removed out of {} lines.".format(remove_count, count))


# Command-line entry point: parse the three positional arguments and run the
# filter.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Remove long utterances from a TSV file.')
    # Each positional argument names a single file, so the help text is
    # singular.
    parser.add_argument('input', type=str, help='Input TSV file')
    parser.add_argument('output', type=str, help='Output TSV file')
    parser.add_argument('bert_model_path', type=str,
                        help='Name or path of the BERT tokenizer')
    args = parser.parse_args()
    main(args)
