from __future__ import print_function
from __future__ import division

import os
import sys
import argparse
from os.path import exists
from collections import Counter
from codecs import open

def get_types(data_file):
    """Collect the distinct whitespace-separated tokens of a UTF-8 text file.

    Args:
        data_file: Path of the text file to scan.

    Returns:
        The keys of a ``Counter`` built over every token in the file, i.e.
        one entry per distinct token.
    """
    type_counts = Counter()
    with open(data_file, 'r', 'utf-8') as handle:
        for raw_line in handle:
            # split() without arguments ignores surrounding whitespace and
            # yields an empty list for blank lines, so those add nothing.
            type_counts.update(raw_line.split())

    return type_counts.keys()

def init_vocab(vocab_file):
    """Load a vocabulary file into word->index and index->word mappings.

    Only the first whitespace-separated field of each non-blank line is used
    as the word. Indices are assigned in order of appearance, counting
    non-blank lines only; when a word occurs more than once it keeps the
    index of its last occurrence.

    Args:
        vocab_file: Path of the UTF-8 vocabulary file.

    Returns:
        A ``(vocab, ivocab)`` tuple where ``vocab`` maps word to index and
        ``ivocab`` is the inverse mapping built from ``vocab``.

    Raises:
        ValueError: If ``vocab_file`` does not exist.
    """
    if not exists(vocab_file):
        raise ValueError('    Vocab file {} not found'.format(vocab_file))

    with open(vocab_file, 'r', 'utf-8') as handle:
        split_lines = (raw_line.split() for raw_line in handle)
        # Blank lines split into an empty list and are skipped, so they do
        # not consume an index.
        entries = [fields[0] for fields in split_lines if fields]

    vocab = {word: position for position, word in enumerate(entries)}
    ivocab = dict((position, word) for word, position in vocab.items())
    return vocab, ivocab


if __name__ == '__main__':
    # Report the percentage of the child's word types (restricted to the
    # child vocab) that also occur among the parent's word types (restricted
    # to the parent vocab).
    parser = argparse.ArgumentParser()
    parser.add_argument('--pf', required=True,
                        help='Parent data file')
    parser.add_argument('--pv', required=True,
                        help='Parent vocab file')
    parser.add_argument('--cf', required=True,
                        help='Child data file')
    parser.add_argument('--cv', required=True,
                        help='Child vocab file')
    args = parser.parse_args()

    parent_vocab, _ = init_vocab(args.pv)
    child_vocab, _ = init_vocab(args.cv)

    # Keep only types covered by the respective vocab. The parent side is a
    # set so the overlap check below is a constant-time lookup per child
    # type rather than a scan over a list.
    parent_types = {x for x in get_types(args.pf) if x in parent_vocab}
    child_types = [x for x in get_types(args.cf) if x in child_vocab]

    # Without any child types the percentage is undefined; exit with a clear
    # message instead of failing on a division by zero.
    if not child_types:
        sys.exit('No child types from {} are present in child vocab {}'.format(
            args.cf, args.cv))

    child_in_parent = [x for x in child_types if x in parent_types]
    print('Amount of child types that appear in parent: {:.2f}%'.format(100 * len(child_in_parent) / len(child_types)))

