#!/usr/bin/env python

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

import sys
from io import open
from argparse import ArgumentParser
import sentencepiece as spm

# True when the interpreter is Python 3; consulted later to choose between
# ``str`` and ``unicode`` when stringifying encoder output.
Py3 = sys.version_info.major == 3

# Command-line interface. The script encodes a text file with an already
# trained SentencePiece model; it does not train anything, so the description
# says so explicitly.
parser = ArgumentParser(
    description='Encode a text file with a trained SentencePiece model')
# Deliberately left optional and unrestricted: only the exact value 'piece'
# selects piece output, every other value (or no value) selects ids.
parser.add_argument(
    '--output_format', type=str,
    help="'piece' writes subword pieces; any other value, or omitting "
         "the option, writes ids")
# The three paths are mandatory: without them the script would only fail
# later with a less helpful error when loading the model or opening a file.
parser.add_argument(
    '--model', type=str, required=True,
    help='path of the SentencePiece model file to load')
parser.add_argument(
    '--infile', type=str, required=True,
    help='UTF-8 text file to encode; it is processed line by line')
parser.add_argument(
    '--outfile', type=str, required=True,
    help='UTF-8 output file; receives one space-separated encoded line '
         'per input line')
args = parser.parse_args()

# Create the processor and load the model given by --model.
# NOTE(review): the path goes through str.format, so a value of None would be
# passed to Load as the literal string "None".
sp = spm.SentencePieceProcessor()
sp.Load("{}".format(args.model))

# Encoder applied to every input line: pieces only when --output_format is
# exactly 'piece'; ids for any other value, including an omitted option.
func = sp.EncodeAsPieces if args.output_format == 'piece' else sp.EncodeAsIds

# Type used to turn each encoded item into text: ``str`` on Python 3,
# ``unicode`` on Python 2. Chosen once here because it never changes
# between lines.
to_text = str if Py3 else unicode  # noqa: F821 -- ``unicode`` is Py2-only

# Mode 'w' is enough: the output file is only written, never read back.
with open(args.infile, encoding='utf-8') as infile, \
        open(args.outfile, 'w', encoding='utf-8') as outfile:
    # Iterate the file object directly so the input is streamed line by
    # line instead of being loaded into memory in full by readlines().
    for line in infile:
        # Drop the trailing newline and surrounding whitespace before
        # handing the text to the encoder.
        line = line.strip()
        encoded = map(to_text, func(line))
        # Exactly one output line is written per input line, so the input
        # and output files stay line-aligned.
        outfile.write('{}\n'.format(' '.join(encoded)))
