import numpy as np
from collections import defaultdict

def _load_coefficients(path):
    """Read a tab-separated ``word<TAB>score`` file into a ``{word: float}`` dict."""
    coeffs = {}
    with open(path) as coeff_file:
        for line in coeff_file:
            line = line.strip()
            if not line:
                # Tolerate blank/trailing lines instead of failing on unpack.
                continue
            word, score = line.split('\t')
            coeffs[word] = float(score)
    return coeffs


def unigram_linear_corel(filename, coeff_filename='evals/coeff.csv'):
    """Correlate per-word scores in *filename* with reference coefficients.

    Each non-blank line of *filename* is lower-cased and split on tabs into
    entries; an entry is expected to be ``word score`` separated by a single
    space. Entries that do not split into exactly two tokens are skipped and
    are not counted. Words that have a reference coefficient contribute to
    the correlations; the others only increase the total count.

    The function prints three lines:

    1. ``valid/total`` -- matched entries over all well-formed entries,
    2. the correlation coefficient (``np.corrcoef``) between every matched
       score and its word's coefficient,
    3. the correlation coefficient between each word's mean score and its
       coefficient.

    It then writes ``filename + '.csv'`` containing ``word,mean_score`` rows
    sorted by mean score, highest first.

    Args:
        filename: Path of the score file to evaluate.
        coeff_filename: Path of the tab-separated ``word<TAB>coefficient``
            reference file. Defaults to ``'evals/coeff.csv'``.

    Raises:
        ValueError: If a score or coefficient cannot be parsed as a float,
            or a non-blank coefficient line does not have exactly two
            tab-separated fields.
        OSError: If one of the files cannot be opened.
    """
    # NOTE: scores are looked up by lower-cased word, while coefficient keys
    # are used as written, so mixed-case coefficient words never match.
    coeff_dict = _load_coefficients(coeff_filename)

    out, truth = [], []
    out_by_word, truth_by_word = defaultdict(list), {}
    valid, total = 0, 0

    with open(filename) as f:
        for line in f:
            if not line.strip():
                continue
            for entry in line.lower().strip().split('\t'):
                items = entry.strip().split(' ')
                if len(items) != 2:
                    # Malformed entry (empty, bare word, or extra tokens):
                    # skip it rather than crash on items[1].
                    continue
                score = float(items[1])
                word = items[0]
                if word in coeff_dict:
                    coeff = coeff_dict[word]
                    out.append(score)
                    truth.append(coeff)

                    out_by_word[word].append(score)
                    truth_by_word[word] = coeff

                    valid += 1
                total += 1

    p = np.corrcoef(out, truth)
    print('%d/%d' % (valid, total))
    print(p[1, 0])

    # Per-word view: one (mean score, coefficient) pair per distinct word.
    mean_by_word = {word: np.mean(scores) for word, scores in out_by_word.items()}
    a = list(mean_by_word.values())
    b = [truth_by_word[word] for word in mean_by_word]
    p2 = np.corrcoef(a, b)
    print(p2[1, 0])

    # Rank words by mean score, highest first (stable for ties).
    ranked = list(mean_by_word.items())
    ranked.sort(key=lambda x: -x[1])

    with open(filename + '.csv', 'w') as wf:
        wf.writelines(['{},{}\n'.format(x[0], x[1]) for x in ranked])

if __name__ == '__main__':
    # Script entry point: run the analysis on each score file in turn.
    for scores_path in (
        'evals/word_importance/reg.txt',
        'evals/word_importance/vanilla.txt',
    ):
        unigram_linear_corel(scores_path)
