import os
import collections

# Root directory and the data directory (relative to the root) that hold the
# dictionary and training files read by this script.
work_dir = "/mnt/yardcephfs/mmyard/g_wxg_td_prc/chulunzhou/fairseq-master_v7"
data_dir = "data-bin/wmt19_ch_en"

# Language suffix used to pick the dictionary and training files.
lang = "en"

# Maps each dictionary entry to its occurrence count (initialised to 0 here),
# preserving the order in which entries appear in the dictionary file.
dicts = collections.OrderedDict()
dict_path = os.path.join(work_dir, data_dir, "dict.{}.txt".format(lang))
with open(dict_path, 'r', encoding='utf-8') as dict_file:
    # NOTE(review): the whole line (minus trailing whitespace) is used as the
    # key; if the dictionary file carries extra columns they become part of
    # the token -- confirm the expected file format.
    for line_no, raw_line in enumerate(dict_file, start=1):
        entry = raw_line.rstrip()
        if entry not in dicts:
            dicts[entry] = 0
            continue
        # Repeated entries are only reported; the first occurrence is kept.
        print("Attention, duplicate token {} (line: {}), please input: ".format(entry, line_no))

# Count how often each dictionary token occurs in the training text.
# Tokens that are not present in `dicts` are tallied in num_excluded_tokens.
num_excluded_tokens = 0
train_path = os.path.join(work_dir, data_dir, "train.ch-en.{}".format(lang))
with open(train_path, 'r', encoding='utf-8') as trf:
    # Iterate over the file handle instead of calling readlines(), so the
    # whole training file is never held in memory at once.
    for line in trf:
        # Splitting on a single space means consecutive spaces (and empty
        # lines) yield empty-string tokens; these are counted like any other
        # token, i.e. as excluded unless '' is itself a dictionary entry.
        for token in line.rstrip().split(' '):
            if token in dicts:
                dicts[token] += 1
            else:
                num_excluded_tokens += 1

print("Number of excluded tokens: ", num_excluded_tokens)

# Write the counts next to the dictionary file, one "<token> <count>" pair per
# line, in the iteration order of `dicts`.
count_path = os.path.join(work_dir, data_dir, "dict.{}.txt.count".format(lang))
with open(count_path, 'w', encoding='utf-8') as out_file:
    out_file.writelines(
        "{} {}\n".format(entry, freq) for entry, freq in dicts.items()
    )
