from __future__ import print_function
from __future__ import division

import sys
import argparse
from codecs import open
from os.path import exists

def init_vocab(vocab_file):
    """Load a vocabulary file into forward and inverse lookup tables.

    The file is read as UTF-8. Lines that are empty or whitespace-only are
    skipped; for every other line only the first whitespace-separated token
    is kept as the word (anything after it on the line is ignored).

    Indices are assigned by position among the non-blank lines, starting at
    zero. If a word appears more than once, the index of its last occurrence
    is the one retained, so the inverse table may have gaps.

    Args:
        vocab_file: Path to the vocabulary file.

    Returns:
        A ``(vocab, ivocab)`` tuple where ``vocab`` maps word -> index and
        ``ivocab`` maps index -> word.

    Raises:
        ValueError: If ``vocab_file`` does not exist.
    """
    if not exists(vocab_file):
        raise ValueError('    Vocab file {} not found'.format(vocab_file))

    vocab = {}
    with open(vocab_file, 'r', 'utf-8') as handle:
        # Splitting a blank line yields an empty list, which is filtered out
        # here so that blank lines do not consume an index.
        token_lists = (raw.split() for raw in handle)
        non_blank = (fields for fields in token_lists if fields)
        for position, fields in enumerate(non_blank):
            vocab[fields[0]] = position

    ivocab = dict((position, token) for token, position in vocab.items())
    return vocab, ivocab

if __name__ == '__main__':
    # Command-line entry point: copy --file to '<file>.rare2unk', replacing
    # every whitespace-separated token that is not in the vocabulary loaded
    # from --vocab with the literal token '_UNK'. Blank lines in the input
    # are dropped and tokens are re-joined with single spaces.
    parser = argparse.ArgumentParser()
    parser.add_argument('--file', type=str, required=True,
                        help='File which rare words are replaced with unk')
    parser.add_argument('--vocab', type=str, required=True,
                        help='Vocab file')
    args = parser.parse_args()

    data_file = args.file
    vocab_file = args.vocab

    # Only the word -> index table is needed, for membership tests.
    vocab, _ = init_vocab(vocab_file)
    out_file = data_file + '.rare2unk'

    # The input is opened first on purpose: context managers are entered left
    # to right, so a missing/unreadable input raises before the output file is
    # created or truncated. Opening in 'w' mode already empties the output, so
    # no separate truncation step is required.
    with open(data_file, 'r', 'utf-8') as fin, \
            open(out_file, 'w', 'utf-8') as fout:
        for line in fin:
            # split() with no arguments ignores leading/trailing whitespace
            # and returns [] for a blank line.
            words = line.split()
            if not words:
                continue

            words = [w if w in vocab else u'_UNK' for w in words]
            fout.write(u' '.join(words) + u'\n')
