import sys
import gzip
import os.path

from collections import Counter

from nltk import word_tokenize

def get_wordlist(f):
    """Count alphabetic tokens that directly follow an alphabetic token.

    Args:
        f: An iterable of text lines (for example an open text file).

    Returns:
        A ``Counter`` mapping each counted token to its number of
        occurrences. A token is counted only when both it and the token
        immediately before it on the same line satisfy ``str.isalpha()``;
        the first token of a line is therefore never counted.

    Lines of 20 characters or fewer (the length includes any trailing
    newline) and lines whose first character is ``<`` are skipped
    (presumably markup lines -- confirm against the corpus format).
    """
    tally = Counter()
    for text in f:
        # Guard clause: ignore short lines and lines starting with '<'.
        if len(text) <= 20 or text[0] == '<':
            continue
        previous_is_alpha = False
        for token in word_tokenize(text):
            current_is_alpha = token.isalpha()
            if previous_is_alpha and current_is_alpha:
                tally[token] += 1
            previous_is_alpha = current_is_alpha
    return tally


def main():
    """Write a ``.words`` frequency list for every file named in argv.

    Each path in ``sys.argv[1:]`` is read as UTF-8 text (through ``gzip``
    when the name ends in ``.gz``) and passed to ``get_wordlist``. The
    result is written next to the input as ``<name>.words``, where
    ``<name>`` is the input's base name up to its first dot. Every output
    line is ``count<TAB>word``, ordered by descending count and then by
    word.
    """
    def by_count_then_word(item):
        # Negated count gives descending frequency; the word breaks ties.
        word, n = item
        return (-n, word)

    for filename in sys.argv[1:]:
        if filename.endswith('.gz'):
            opener, mode = gzip.open, 'rt'
        else:
            opener, mode = open, 'r'
        with opener(filename, mode, encoding='utf-8') as source:
            counts = get_wordlist(source)

        directory, base = os.path.split(filename)
        name = base.split('.')[0]
        outfile = os.path.join(directory, name+'.words')

        ranked = sorted(counts.items(), key=by_count_then_word)
        with open(outfile, 'w', encoding='utf-8') as sink:
            for word, n in ranked:
                print('%d\t%s' % (n, word), file=sink)


# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
    main()

