import numpy as np
from scipy import stats
from transformers import AutoTokenizer
import pickle

def get_p_value(arrA, arrB):
    """Return the p-value of an independent two-sample t-test.

    Both inputs are converted to NumPy arrays and compared with
    ``scipy.stats.ttest_ind`` using ``equal_var=False``, i.e. the two
    samples are not assumed to share a variance.

    Args:
        arrA: First sample (any array-like of numbers).
        arrB: Second sample (any array-like of numbers).

    Returns:
        The p-value reported by ``scipy.stats.ttest_ind``.

    Side effects:
        Prints the t statistic to stdout.
    """
    first_sample, second_sample = (np.array(values) for values in (arrA, arrB))

    statistic, p_value = stats.ttest_ind(
        first_sample, second_sample, equal_var=False
    )

    # The statistic is only reported, not returned.
    print(statistic)
    return p_value

if __name__ == "__main__":
    # Report what fraction of the link-prediction vocabulary is also present
    # in the BERT and RoBERTa tokenizer vocabularies.

    # NOTE(security): unpickling can execute arbitrary code; only load a
    # vocab file that comes from a trusted source.
    with open("./link_prediction_vocab.pkl", "rb") as vocab_file:
        vocab = pickle.load(vocab_file)

    total = len(vocab)
    if total == 0:
        # Fail with a clear message instead of a ZeroDivisionError below.
        raise SystemExit("./link_prediction_vocab.pkl contains no tokens.")

    roberta_tok = AutoTokenizer.from_pretrained("roberta-base")
    bert_tok = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Resolve each vocab mapping once: the lookup is loop-invariant, and on
    # fast tokenizers `.vocab` is a property that may rebuild the dict on
    # every access.
    roberta_vocab = roberta_tok.vocab
    bert_vocab = bert_tok.vocab

    roberta_cnt = 0
    bert_cnt = 0
    for position, token in enumerate(vocab, start=1):
        print("{}|{}".format(position, total))
        if token in bert_vocab:
            bert_cnt += 1
        # A token counts for RoBERTa if it appears either bare or with the
        # "Ġ" prefix (presumably RoBERTa's marker for a preceding space).
        if token in roberta_vocab or "Ġ" + token in roberta_vocab:
            roberta_cnt += 1

    print("Proportion of tokens in vocab that also reside in BERT: {}".format(bert_cnt / total))
    print("Proportion of tokens in vocab that also reside in RoBERTa: {}".format(roberta_cnt / total))

    # Stop here. `raise SystemExit` replaces the interactive-only `exit()`
    # helper and yields the same zero exit status.
    raise SystemExit

    # NOTE: unreachable while the early exit above is in place. Remove the
    # `raise SystemExit` line to run the significance test on these scores.
    p = get_p_value([61.5,62.0,61.9, 61.8, 61.8], [62,62,60, 61.3, 61.3])
    print(p)