from __future__ import print_function, division

import sys
from codecs import open
from collections import Counter

try:
    from itertools import izip
except ImportError:  # Python 3 has no izip; its builtin zip is already lazy
    izip = zip

try:
    # Python 2: the builtin zip() materialises every pair up front, so use
    # the lazy variant to stream large corpora line by line.
    from itertools import izip as _zip
except ImportError:
    # Python 3: the builtin zip() is already lazy.
    _zip = zip


def count_parallel_stats(src_lines, trg_lines, length):
    """Collect sentence, token and vocabulary counts for paired lines.

    Lines are paired positionally; pairing stops as soon as either iterable
    is exhausted. A pair is counted only when both lines contain at least
    one whitespace-separated token and neither side has more than
    ``length`` tokens.

    Args:
        src_lines: iterable of source-side text lines.
        trg_lines: iterable of target-side text lines.
        length: maximum number of tokens allowed on each side.

    Returns:
        A tuple ``(num_sent, src_tok_count, trg_tok_count, src_vocab,
        trg_vocab)`` where the first three items are ints and the last two
        are ``Counter`` objects mapping token -> occurrences, all computed
        over the kept pairs only.
    """
    num_sent = 0
    src_tok_count = 0
    trg_tok_count = 0
    src_vocab = Counter()
    trg_vocab = Counter()

    for src_line, trg_line in _zip(src_lines, trg_lines):
        # split() with no argument ignores leading/trailing whitespace, so
        # an empty result means the line was blank.
        src_toks = src_line.split()
        trg_toks = trg_line.split()
        if not src_toks or not trg_toks:
            continue
        if len(src_toks) > length or len(trg_toks) > length:
            continue

        num_sent += 1
        src_tok_count += len(src_toks)
        trg_tok_count += len(trg_toks)
        src_vocab.update(src_toks)
        trg_vocab.update(trg_toks)

    return num_sent, src_tok_count, trg_tok_count, src_vocab, trg_vocab


def _count_frequent(vocab, min_freq):
    """Return how many distinct tokens in ``vocab`` occur >= ``min_freq`` times."""
    return sum(1 for freq in vocab.values() if freq >= min_freq)


def main(argv=None):
    """Print corpus statistics for two line-aligned files.

    Args:
        argv: argument list shaped like ``sys.argv`` (program name first);
            defaults to ``sys.argv``. Expected arguments after the program
            name: source file path, target file path, maximum sentence
            length in tokens, minimum token frequency. Extra trailing
            arguments are ignored.

    Raises:
        SystemExit: with a usage message when fewer than four arguments are
            given or when the length / frequency arguments are not integers.
    """
    if argv is None:
        argv = sys.argv

    usage = 'Usage: {} SRC_FILE TRG_FILE MAX_LENGTH MIN_FREQ'.format(
        argv[0] if argv else 'script')
    if len(argv) < 5:
        sys.exit(usage)

    file_1 = argv[1]
    file_2 = argv[2]
    try:
        length = int(argv[3])
        min_freq = int(argv[4])
    except ValueError:
        sys.exit(usage)

    # Both files are read as UTF-8 text and streamed in lockstep.
    with open(file_1, 'r', 'utf-8') as src_f, open(file_2, 'r', 'utf-8') as trg_f:
        (num_sent, src_tok_count, trg_tok_count,
         src_vocab, trg_vocab) = count_parallel_stats(src_f, trg_f, length)

    print('There are {} lines length <= {}'.format(num_sent, length))
    print('Size: src {:,}, trg {:,}'.format(src_tok_count, trg_tok_count))
    print('Unique toks, src: {}, trg: {}'.format(len(src_vocab), len(trg_vocab)))
    min_freq_src_tok_count = _count_frequent(src_vocab, min_freq)
    min_freq_trg_tok_count = _count_frequent(trg_vocab, min_freq)
    print('Number of toks appear >= {}, src: {}, trg: {}'.format(
        min_freq, min_freq_src_tok_count, min_freq_trg_tok_count))


if __name__ == '__main__':
    main()