import sys

import Levenshtein

from data import Task1File

def main():
    """Print the corpus words closest in edit distance to the reference vocabulary.

    Command-line arguments:
        sys.argv[1]: path handed to ``Task1File`` to load the reference data.
            ``ref.data`` is iterated as 3-tuples whose first element is
            collected as a lemma and whose second as a form (``None`` forms
            are discarded).
        sys.argv[2]: UTF-8 text file with tab-separated ``count<TAB>word``
            lines. The count on the first line is used to derive the
            frequency cut-off; that line is consumed and never scored.

    For every scored word, the smallest Levenshtein distance to any reference
    lemma or form *other than the word itself* is computed. The (up to)
    10000 words with the smallest such distance are written to stdout, one
    per line; ties are ordered by the word string. Progress and diagnostics
    go to stderr.

    Exits with status 1 and a usage message if fewer than two arguments are
    given.
    """
    if len(sys.argv) < 3:
        print('Usage: %s REFERENCE FREQUENCY_LIST' % sys.argv[0],
              file=sys.stderr, flush=True)
        sys.exit(1)

    ref = Task1File(sys.argv[1])

    lemmas = {src for src, _, _ in ref.data}
    forms = {trg for _, trg, _ in ref.data} - {None}
    print('Loaded %d lemmas, %d forms from reference' % (
        len(lemmas), len(forms)),
        file=sys.stderr, flush=True)

    candidates = forms | lemmas

    distances = []
    with open(sys.argv[2], 'r', encoding='utf-8') as f:
        # The first line supplies the reference count for the cut-off.
        # NOTE(review): presumably the file is sorted by descending count,
        # making this the maximum -- confirm against the file producer.
        max_n = int(next(f).split('\t')[0])
        # Cut-off is one millionth of that count, but never below 2.
        n_limit = max(2, int(max_n / 1000000))
        print('Frequency limit: %d' % n_limit, file=sys.stderr, flush=True)
        for i, line in enumerate(f):
            # Skip the first 1000 lines after the header line.
            if i < 1000:
                continue
            fields = line.rstrip('\n').split('\t')
            # Silently ignore lines that are not exactly "count<TAB>word".
            if len(fields) != 2:
                continue
            n = int(fields[0])
            # Stop at the first count below the cut-off; every later line is
            # left unread (relies on the assumed descending order).
            if n < n_limit:
                break
            word = fields[1]
            if i % 10000 == 0:
                print('.', end='', file=sys.stderr, flush=True)
            # default=None guards against an empty comparison set (no
            # reference entries, or the only entry equals the word), where a
            # bare min() would raise ValueError.
            nearest = min(
                (Levenshtein.distance(candidate, word)
                 for candidate in candidates
                 if candidate != word),
                default=None)
            if nearest is None:
                continue
            distances.append((nearest, word))

    print('done!', file=sys.stderr, flush=True)
    # Tuples sort by distance first, then by word.
    distances.sort()
    for _, word in distances[:10000]:
        print(word)

# Run the extraction only when this file is executed as a script.
if __name__ == '__main__':
    main()

