import gzip, re
import sys
from collections import Counter


# Language codes to process. Each code is spliced into the data file names
# used below ("<code>_train.conllu", "wiki.multi.<code>.vec", ...).
lang = [
    "ar", "bg", "ca", "cs", "da", "de", "en", "es",
    "et", "fi", "fr", "he", "hi", "hr", "id", "it",
    "ja", "ko", "la", "lv", "nl", "no", "pl", "pt",
    "ro", "ru", "sk", "sl", "sv", "uk", "zh",
]
# Path prefix (trailing slash included) that is concatenated in front of
# every input and output file name.
Dir = "/home/taoji/data_from_8015/Zdep_data/data2.2_more/"


# Build UD Vocabulary
# pre_vocab maps each language code to the set of case-folded strings found
# in the second and third tab-separated columns of its train/test/dev
# .conllu files (presumably FORM and LEMMA per the CoNLL-U layout).
pre_vocab = dict((code, set()) for code in lang)
for la in lang:
    seen = pre_vocab[la]
    for suffix in ("_train.conllu", "_test.conllu", "_dev.conllu"):
        with open(Dir + la + suffix, 'r', encoding='UTF-8') as conllu:
            for row in conllu:
                cols = row.strip().split('\t')
                # Only rows whose first column is purely digits are used, so
                # blank lines, comment lines and IDs such as "1-2" are skipped.
                if not cols[0].isdigit():
                    continue
                seen.add(cols[1].casefold())
                seen.add(cols[2].casefold())
                

# Cut Pretrained fasttext Vocabulary
# For each language, copy the rows of wiki.multi.<la>.vec whose leading word
# (case-folded) is in pre_vocab[la] into wiki.multi.cut.<la>.vec, then print
# the language, the number of rows kept, the vocabulary size and their ratio.
for la in lang:
    wanted = pre_vocab[la]
    with open(Dir+"wiki.multi."+la+".vec", 'r', encoding='UTF-8') as fp:
        # The first line is a header whose second space-separated field is the
        # vector dimension; the row count is recomputed for the reduced file.
        w_dim = int(fp.readline().strip().split(' ')[1])
        # Only the leading token is needed for the lookup, so split once
        # instead of tokenising the whole vector row. Rows are kept verbatim.
        results = [w for w in fp if w.split(' ', 1)[0].casefold() in wanted]
    # The input is fully read and closed before the output file is created.
    with open(Dir+"wiki.multi.cut."+la+".vec", 'w', encoding='UTF-8') as fout:
        fout.write(str(len(results))+" "+str(w_dim)+u"\n")
        fout.writelines(results)
    # Guard the ratio: an empty vocabulary would raise ZeroDivisionError.
    # The ratio may exceed 1 because differently-cased rows can match the
    # same case-folded vocabulary entry.
    ratio = len(results)/len(wanted) if wanted else 0.0
    print('%s%8d%8d%8.4f' % (la, len(results), len(wanted), ratio))
        

