
import re
import sys
import tempfile
from model import Model

# Out-of-vocabulary (OOV) token substituted for words seen only once
unk = "unk"

# Matches the runs of whitespace that separate words on a line
whitespace = re.compile(r'\s+')


def tokenize(line):
    """Return the non-empty whitespace-separated words of *line*.

    ``whitespace.split`` yields empty strings when a line starts or ends
    with whitespace (every line read from a file ends with a newline).
    Those are dropped here so that the empty string is never counted as a
    word type or replaced by the OOV token.
    """
    return [word for word in whitespace.split(line) if word]


def replace_singletons(lines, out, count, oov=unk):
    """Copy *lines* to *out*, replacing words that occur only once.

    The input is consumed in a single pass (it may be a non-seekable
    stream such as ``sys.stdin``), so it is spooled to a temporary file
    while counting and then re-read for the substitution pass.

    Args:
        lines: iterable of text lines making up the corpus.
        out: object with a ``write(str)`` method that receives the output.
        count: empty counter used to accumulate word frequencies. It must
            accept ``count[word] += 1.0`` for words it has not seen yet.
        oov: token written in place of every word whose count is not
            greater than one.

    Each output line consists of its words, every one followed by a
    single space, terminated by a newline.
    """
    # Text mode: the default 'w+b' rejects str lines on Python 3.
    # The context manager closes (and deletes) the file even on error.
    with tempfile.TemporaryFile(mode="w+") as corpus:
        # First pass: keep a copy of the corpus and count word types
        for line in lines:
            corpus.write(line)
            for word in tokenize(line):
                count[word] += 1.0

        # Second pass: substitute the OOV token for words seen only once
        corpus.seek(0)
        for line in corpus:
            for word in tokenize(line):
                out.write(word if count[word] > 1 else oov)
                out.write(' ')
            out.write('\n')


def main():
    """Filter standard input to standard output.

    Word frequencies are accumulated in a project ``Model`` instance,
    which is assumed to treat unseen words as having a count of zero.
    """
    replace_singletons(sys.stdin, sys.stdout, Model())


if __name__ == "__main__":
    main()
