from sklearn import manifold
import argparse
import numpy as np
import math
from scipy import stats

# When True, only the words of interest (the manifold-fitting words and the task test words)
# are loaded from the GloVe file instead of the whole space, to save on memory usage
__filter_by_words_to_load__ = True
# Directory that contains the word similarity task directories (e.g. wordsim353, rg-65)
__tasks_top_dir__="."
# Directory that contains the pre-trained GloVe directories, laid out as glove.<N>B/glove.<N>B.<D>d.txt
# NOTE: this is a placeholder value and must be set to a real path before running
__glove_top_dir__="path to GloVe directory"
##########################################################################
def main():
    """
    Run the whole experiment: first the baseline on the original space, then the re-embedding evaluation
    """

    # The pre-trained space is given as [corpus size, vector dimensionality];
    # [42, 300] selects the GloVe space over the 42B corpus with 300 dimensional vectors
    glove_space = [42, 300]

    # Baseline: performance of the untouched embedding space on the similarity tasks
    compute_baseline(glove_space)

    # Parameter ranges for the re-embedding experiments (built in the same order as they are passed)
    experiment_ranges = dict(
        # The manifold dimensionality is kept equal to the original embedding space dimensionality
        manifold_dimensionality_range=param_range(300, 300, 1),
        # The number of local neighbours can be set to values above the manifold dimensionality
        num_local_neighbours_range=param_range(1000, 1000, 1),
        # Effective starts for the fitting window are in the range [5000 to 15000]
        window_start_range=param_range(5000, 15000, 200),
        # The window length is set to the number of local neighbours + 1
        window_length_range=param_range(1001, 1001, 1),
    )

    # Evaluate the re-embedding approach on every configuration of the ranges
    test_re_embedding(glove_space, **experiment_ranges)

###########################################################################

def test_re_embedding(original_embedding_space,manifold_dimensionality_range,num_local_neighbours_range, window_start_range, window_length_range):
    """
    Evaluate the performance of the manifold-based re-embedding approach using the word similarity tasks.
    For every combination of the parameter ranges, a Locally Linear Embedding manifold is fitted on a window
    of words taken from the original space, every test word vector is re-embedded with the fitted manifold,
    and the Spearman rank correlation between the cosine similarities of the re-embedded pairs and the human
    judgements is printed. The average score per task over all configurations is printed at the end.
    :param original_embedding_space: the pre-trained GloVe space to be re-embedded, as [corpus size, dimensionality]
    :param manifold_dimensionality_range: the range of manifold dimensionalities to experiment with
    :param num_local_neighbours_range: the range of local neighbours (input to the manifold learning) to experiment with
    :param window_start_range: the range of window starts (start of the sample to learn the manifold) to experiment with;
                               its first and last entries bound the words that are loaded for fitting
    :param window_length_range: the range of window length to experiment with (the length is the number of words in the window for manifold learning);
                                its last entry bounds the words that are loaded for fitting
    """

    print("=========================================")
    print("TEST ON RE-EMBEDDING ",str(original_embedding_space[0])+"B",str(original_embedding_space[1])+"d")

    #The word similarity tasks represented by their ground truth files
    #Each task is [file path, number of header lines to skip, display name, ground truth (filled in below)]
    tasks=[
        #WS353 similarity task (Set1 of the ground truth)
        ['/wordsim353/set1.tab',1, 'WS353Sim',""],

        #RG65 similarity task
        ['/rg-65/rg-65.tsv',0, 'RG65',""]
    ]

    #For each parameters configuration, a score is obtained for each task
    #The average performance is accumulated per task as [sum of scores, count of scores]
    avg_performance = {task[2]: [0, 0] for task in tasks}

    #Start preparing the words to load from the GloVe file, to avoid loading unnecessary words into memory
    #The test words are specified to be loaded
    words_to_load = set()
    for task in tasks:
        words_to_load = words_to_load.union(select_words_to_load(__tasks_top_dir__+ task[0]))

    #Load the ground truth for the word similarity tasks (i.e. human judgement values for each pair of words)
    #The files are read as utf8, the same way select_words_to_load reads them, and blank lines are ignored
    for task in tasks:
        with open(__tasks_top_dir__+ task[0], 'r', encoding="utf8") as f:
            rows = [line.rstrip().split('\t') for line in f.readlines()[task[1]:] if line.strip()]
        task[3] = [(row[0].lower(), row[1].lower(), float(row[2])) for row in rows]

    #Select the overall words which would be used to fit the manifold for the various options of parameters ranges
    #The index of a word is its line position in the GloVe file
    vocab, ivocab= load_vocab_from_file(original_embedding_space[0],original_embedding_space[1])
    overall_ids_for_fitting = range(window_start_range[0], window_start_range[-1]+window_length_range[-1]+1)
    overall_words_for_fitting_map= {idx: ivocab[idx] for idx in overall_ids_for_fitting}

    #The overall words which would be used to fit the manifold are set for loading into memory
    words_to_load = words_to_load.union(overall_words_for_fitting_map.values())

    #Load the vocabulary and embedding space from the files into memory
    #(the index-to-word map and the per-feature min and max vectors are not needed here)
    vocab, _, W, _, _ = load_from_file(words_to_load,original_embedding_space[0],original_embedding_space[1])

    #Normalization to unit norm
    d = (np.sum(W ** 2, 1) ** (0.5))
    W = (W.T / d).T

    print("-----------------------------------------")
    print("Detailed performance")
    print("manifold_dimensionality", "num_local_neighbours", "window_start", "window_length", sep='\t', end='')
    for task in tasks:
        print('\t'+task[2], end='')
    print()

    #For each configuration of manifold dimensionality, number of local neighbours, window start, and window length, learn the manifold and evaluate the re-embedding
    for manifold_dimensionality in manifold_dimensionality_range:
        for num_local_neighbours in num_local_neighbours_range:
            for window_start in window_start_range:
                for window_length in window_length_range:
                    #Identify the words of the window to be used for fitting the manifold
                    #dict.fromkeys removes duplicates like a set would, but keeps the file order so runs are reproducible
                    words_to_fit_for = dict.fromkeys(
                        overall_words_for_fitting_map[window_start + offset] for offset in range(window_length))

                    #Select the vectors from the original embedding which correspond to the words of the window
                    #Only words that were actually loaded contribute a row, so no all-zero padding rows reach the fit
                    rows_to_fit = np.asarray([vocab[w] for w in words_to_fit_for if w in vocab], dtype=np.intp)
                    W_To_Fit = W[rows_to_fit]

                    #Fit the manifold using the Locally Linear Embedding algorithm
                    #The parameters are passed by keyword because recent scikit-learn releases reject positional ones
                    lle = manifold.LocallyLinearEmbedding(n_neighbors=num_local_neighbours,
                                                          n_components=manifold_dimensionality)
                    lle.fit(W_To_Fit)

                    print(manifold_dimensionality, num_local_neighbours, window_start, window_length, sep='\t', end='')

                    #Re-embedded test vectors for this fitted manifold, so each word is transformed only once
                    reembedded = {}

                    #Evaluate on the word similarity tasks
                    for task in tasks:
                        scored_pairs = []
                        updated_ground_truth = []
                        #For each word pair in the word similarity task, get the vectors, and re-embed them before computing the cosine similarity
                        for gtpair in task[3]:
                            missing_words = [w for w in gtpair[:2] if w not in vocab]
                            if missing_words:
                                #Pairs with a word outside the loaded vocabulary are reported and left out of the score
                                for w in missing_words:
                                    print(w, 'not in vocab')
                                continue

                            #Re-embed each test vector using the learned manifold before computing the cosine similarity
                            for w in gtpair[:2]:
                                if w not in reembedded:
                                    reembedded[w] = lle.transform(W[vocab[w]].reshape(1, -1))[0]
                            v1 = reembedded[gtpair[0]]
                            v2 = reembedded[gtpair[1]]

                            # Here the dot product is divided on the |x|*|y| to get the cosine
                            d1 = (np.sum(v1 ** 2, ) ** (0.5))
                            d2 = (np.sum(v2 ** 2, ) ** (0.5))
                            sim = np.dot(v1, v2.T) / (d1 * d2)

                            if math.isnan(sim):
                                print("A NaN is found: {0}, {1}".format(gtpair[0], gtpair[1]))

                            scored_pairs.append((gtpair[0], gtpair[1], sim))
                            updated_ground_truth.append((gtpair[0], gtpair[1], gtpair[2]))

                        #Calculate the Spearman Rank Correlation between the re-embedding results, and the ground truth
                        xx = [x[2] for x in scored_pairs]
                        yy = [y[2] for y in updated_ground_truth]
                        spearman_correlation = stats.spearmanr(xx, yy)[0]
                        print('\t' + str(spearman_correlation), end='')

                        #Add the results of this configuration experiment to the average which is printed in the end
                        avg_performance[task[2]][0] = avg_performance[task[2]][0] + spearman_correlation #sums of scores
                        avg_performance[task[2]][1]=avg_performance[task[2]][1]+1 #counts of scores
                    print()

    print("-----------------------------------------")
    #Print the average performance of the re-embedding approach on all the configurations of parameters
    for task in tasks:
        score_sum, score_count = avg_performance[task[2]]
        #With an empty parameter range no configuration was run, so there is nothing to average
        avg = score_sum / score_count if score_count else float('nan')
        print(task[2], "Avg:", avg)
    print("=========================================")



def compute_baseline(original_embedding_space):
    """
    Test the performance of the original embedding space on each similarity task, to set the baseline.
    The vectors are min-max scaled per feature to [-1, 1], normalized to unit norm, and the Spearman rank
    correlation between the cosine similarities of the word pairs and the human judgements is printed per task.
    :param original_embedding_space: the original GloVe space, as [corpus size, dimensionality]
    """
    print("=========================================")
    print("BASLINE- TEST ON ORIGINAL EMBEDDING SPACE ",str(original_embedding_space[0])+"B",str(original_embedding_space[1])+"d")

    #Each task is [file path, number of header lines to skip, display name, ground truth (filled in below)]
    tasks=[
        ['/wordsim353/set1.tab',1, 'WS353Sim',""],
        ['/rg-65/rg-65.tsv',0, 'RG65',""]
    ]

    #Start preparing the words to load from the GloVe file, to avoid loading unnecessary words into memory
    #The test words are specified to be loaded
    words_to_load = set()
    for task in tasks:
        words_to_load = words_to_load.union(select_words_to_load(__tasks_top_dir__+ task[0]))

    #Load the vocabulary and embedding space from the files into memory, and get the overall min and max vectors for each feature
    vocab, _, W, min_vector, max_vector = load_from_file(words_to_load,original_embedding_space[0],original_embedding_space[1])

    # Normalize at the feature level by x'=(2*(x-min)/(max-min))-1
    W = (2 * (W - min_vector) / (max_vector - min_vector)) - 1

    # Normalize vectors to unit norms
    d = (np.sum(W ** 2, 1) ** (0.5))
    W = (W.T / d).T

    #Load the ground truth for the word similarity tasks (i.e. human judgement values for each pair of words)
    #The files are read as utf8, the same way select_words_to_load reads them, and blank lines are ignored
    for task in tasks:
        with open(__tasks_top_dir__+ task[0], 'r', encoding="utf8") as f:
            rows = [line.rstrip().split('\t') for line in f.readlines()[task[1]:] if line.strip()]
        task[3] = [(row[0].lower(), row[1].lower(), float(row[2])) for row in rows]

    #For each task, evaluate the original embedding to get the baseline.
    for task in tasks:
        scored_pairs = []
        updated_ground_truth = []
        for gtpair in task[3]:
            # Pairs with a word outside the loaded vocabulary are reported and left out of the score
            missing_words = [w for w in gtpair[:2] if w not in vocab]
            if missing_words:
                for w in missing_words:
                    print(w, 'not in vocab')
                continue

            # For each pair of words in the word similarity task, compute the cosine similarity
            v1 = W[vocab[gtpair[0]]]
            v2 = W[vocab[gtpair[1]]]

            d1 = (np.sum(v1 ** 2, ) ** (0.5))
            d2 = (np.sum(v2 ** 2, ) ** (0.5))
            sim = np.dot(v1, v2.T) / (d1 * d2)

            if math.isnan(sim):
                print("A NaN is found: {0}, {1}".format(gtpair[0], gtpair[1]))

            scored_pairs.append((gtpair[0], gtpair[1], sim))
            updated_ground_truth.append((gtpair[0], gtpair[1], gtpair[2]))

        # Calculate the Spearman Rank Correlation between the results, and the ground truth to find the baseline
        xx = [x[2] for x in scored_pairs]
        yy = [y[2] for y in updated_ground_truth]

        spearman_correlation = stats.spearmanr(xx, yy)
        print(task[2],"Spearman Rank Correlation:", spearman_correlation[0])
    print("=========================================")


def select_words_to_load(filepath):
    """
    Collect the test words from a word similarity task file
    :param filepath: path of the tab-separated word similarity task file; the first two columns of
                     each line hold the two words of a pair
    :return: the set of lower-cased words found in the first two columns of every non-blank line.
             No header line is skipped, so the cells of a header row (if any) are part of the set
    """

    words_to_load = set()

    with open(filepath, 'r', encoding="utf8") as f:
        for line in f:
            # Blank lines (e.g. a trailing empty line) carry no word pair
            if not line.strip():
                continue
            # Only the two word columns are needed; the score column is not read here
            fields = line.rstrip().split('\t')
            words_to_load.add(fields[0].lower())
            words_to_load.add(fields[1].lower())

    return words_to_load

def load_from_file(words_to_load, corpus_size, num_of_dimensions):
    """
    Load the GloVe space from the file into memory.
    The file paths default to the GloVe file of the given corpus size and dimensionality under the GloVe
    top directory, and can be overridden on the command line with --vocab_file and --vectors_file.
    :param words_to_load: the words to load their vectors into memory, to save on memory; it is only
                          applied when the module-level filter flag is enabled
    :param corpus_size: the original GloVe embedding corpus size such as 6B or 42B
    :param num_of_dimensions: the dimensionality of the original space. such as 50, 100, 200, or 300
    :return: (vocab, ivocab, W, min_vector, max_vector) where vocab maps word to row index, ivocab maps
             row index to word, W holds one vector per loaded word, and min_vector / max_vector hold the
             per-feature minimum / maximum over every vector in the file (including the words not loaded)
    """
    parser = argparse.ArgumentParser()
    # NOTE(review): '-f' is accepted but never read; presumably it is there so that the script can run
    # inside a notebook kernel which passes '-f' on the command line -- confirm
    parser.add_argument('-f')
    parser.add_argument('--vocab_file',
                        default=__glove_top_dir__+ '/glove.{0}B/glove.{0}B.{1}d.txt'.format(corpus_size, num_of_dimensions), type=str)
    parser.add_argument('--vectors_file',
                        default=__glove_top_dir__+'/glove.{0}B/glove.{0}B.{1}d.txt'.format(corpus_size, num_of_dimensions), type=str)
    args = parser.parse_args()

    def is_selected(word):
        # Every word is kept when filtering is disabled, otherwise only the requested ones
        return (not __filter_by_words_to_load__) or word in words_to_load

    #First pass: collect the words to keep, in the order they appear in the file
    with open(args.vocab_file, 'r', encoding="utf8") as f:
        words = [word for word in (line.rstrip().split(' ')[0] for line in f) if is_selected(word)]

    vocab_size = len(words)

    #A dictionary of word to index
    vocab = {w: idx for idx, w in enumerate(words)}

    #A dictionary of index to word
    ivocab = {idx: w for idx, w in enumerate(words)}

    # Free the memory from the words as vocab has been created
    del words

    vector_dim=num_of_dimensions

    #The min and max for each feature are computed while loading. They start from +inf / -inf so that
    #the result is correct whatever the magnitude of the values in the file
    min_vector = np.full(num_of_dimensions, np.inf)
    max_vector = np.full(num_of_dimensions, -np.inf)

    #Second pass: fill in the array, row by row in the same order as the first pass
    W = np.zeros((vocab_size, vector_dim))
    index=0
    with open(args.vectors_file, 'r', encoding="utf8") as f:
        for line in f:
            vals = line.rstrip().split(' ')
            vector=list(map(float, vals[1:]))
            #The min and max cover every vector of the file, also those of the words which are not loaded
            min_vector=np.minimum(min_vector, vector)
            max_vector = np.maximum(max_vector, vector)
            if not is_selected(vals[0]): continue

            W[index, :] = vector

            index=index+1

    return vocab, ivocab, W, min_vector, max_vector

def load_vocab_from_file(corpus_size, num_of_dimensions):
    """
    Read only the vocabulary (no vectors) of a GloVe embedding file
    :param corpus_size: the original GloVe embedding corpus size such as 6B or 42B
    :param num_of_dimensions: the dimensionality of the original space. such as 50, 100, 200, or 300
    :return: (vocab, ivocab) where vocab maps each word to its line index in the file and ivocab maps
             each line index to its word
    """
    default_path = __glove_top_dir__+'/glove.{0}B/glove.{0}B.{1}d.txt'.format(corpus_size, num_of_dimensions)

    #The same command line options as the full loader are accepted; only --vocab_file is read here
    cli = argparse.ArgumentParser()
    cli.add_argument('-f')
    for option in ('--vocab_file', '--vectors_file'):
        cli.add_argument(option, default=default_path, type=str)
    options = cli.parse_args()

    #The word is the first space-separated token of each line
    with open(options.vocab_file, 'r', encoding="utf8") as glove_file:
        tokens = [row.rstrip().split(' ')[0] for row in glove_file]

    #A dictionary of index to word
    ivocab = dict(enumerate(tokens))

    #A dictionary of word to index
    vocab = {token: position for position, token in ivocab.items()}

    return vocab, ivocab

def param_range(start, end, step):
    """
    A range function whose upper bound is end + 1, so that for integer arguments the end itself
    is included whenever it is reachable from the start with the given step
    :return: the values as a list
    """
    values = np.arange(start, end + 1, step)
    return list(values)

#Entry point: the experiments are run only when this file is executed as a script,
#not when it is imported as a module
if __name__ == "__main__":
    main()
