# word_analogy.py
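# This script handles evaluating a genetic embedding on the word analogy task outlined by Mikolov et al.
# NOTE(review): the code in this file prompts for a word and ranks vocabulary words by similarity to it;
# confirm that the word-analogy description above is still accurate.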


# Internal Imports

# External Imports
import json

# Globals
# Number of bits per chunk when an embedding bit-string is processed piecewise.
chunk_size = 16


def main(embedding_path="", dim=0, log_output=False):
    """Interactively print the vocabulary words most similar to a query word.

    Loads the "evaluation" -> "qualitative" section of the JSON config, imports
    the embedding, prompts for a word on stdin, and prints up to ``top_k``
    vocabulary words ranked by descending similarity to that word.  The query
    word itself is part of the ranking.

    Args:
        embedding_path: Path of the embedding file; when empty, the config's
            "embedding_path" entry is used instead.
        dim: Number of bits per embedding; when 0, the config's "dimension"
            entry is used instead.
        log_output: Currently unused by this function.
    """
    top_k = 10
    # Start by loading the configuration
    with open("src/genetic_embedding/core/config.json") as config_file:
        config = json.load(config_file)["evaluation"]["qualitative"]

    # An explicit `dim` argument overrides the configured dimension
    dimension = dim if dim else int(config["dimension"])

    # Import the chromosome, preferring the explicitly supplied path
    print("EMBEDDING: Importing chromosome")
    embedding = import_chromosome(embedding_path or config["embedding_path"], dimension)
    print("EMBEDDING: Chromosome loaded")

    # Get the word from the user
    print("Input a vocabulary word to inspect")
    word = input("> ")

    # Words outside the vocabulary cannot be ranked
    if word not in embedding:
        print(word + " is not in the embedding provided. Please try another word")
        return

    # Build a ranked list of vocab words sorted by similarity.  The query
    # embedding is looked up once rather than on every key evaluation.
    query = embedding[word]
    sim_words = sorted(
        embedding,
        reverse=True,
        key=lambda vocab_word: similarity(query, embedding[vocab_word], dimension),
    )

    # Print the top K entries; slicing (rather than indexing up to top_k)
    # keeps this safe when the vocabulary holds fewer than top_k words.
    print("Similarity Ranking for \"" + word + "\"")
    for rank, vocab_word in enumerate(sim_words[:top_k], start=1):
        print(str(rank) + ":\t" + vocab_word)




def import_chromosome(path, dim):
    """Load an embedding from a tab-separated file into a dict.

    Each line is expected to hold a word and its embedding string separated by
    a tab; any further tab-separated fields are ignored.  If a word appears
    more than once, the last occurrence wins.

    Args:
        path: Path of the file to read.
        dim: Expected length of every embedding string.  Entries with a
            different length are still loaded, but a warning with their count
            is printed.

    Returns:
        A dict mapping each word to its embedding string.
    """
    embedding = {}
    num_inconsistent = 0
    num_malformed = 0
    with open(path) as chromosome_file:
        for raw_line in chromosome_file:
            fields = raw_line.strip().split("\t")
            if len(fields) < 2:
                # Blank lines (e.g. a trailing newline) are skipped silently;
                # non-blank lines without a tab are skipped and reported below
                # instead of crashing with an IndexError.
                if fields[0]:
                    num_malformed += 1
                continue
            word, bits = fields[0], fields[1]
            embedding[word] = bits
            if len(bits) != dim:
                num_inconsistent += 1
    if num_malformed > 0:
        print("WARNING: {} malformed lines were skipped".format(num_malformed))
    if num_inconsistent > 0:
        print("WARNING: {}/{} embeddings deviate from the standard {} bits".format(num_inconsistent, len(embedding), dim))
    return embedding


def similarity(embed1, embed2, dimension):
    """Count the bit positions at which two embeddings agree.

    Only the first ``dimension`` characters of each embedding are compared.
    An embedding shorter than ``dimension`` is padded on the right with "0"
    so that positions stay aligned.

    Args:
        embed1: First embedding, a string of "0"/"1" characters.
        embed2: Second embedding, a string of "0"/"1" characters.
        dimension: Number of bit positions to compare.

    Returns:
        The number of matching positions, between 0 and ``dimension``.
        Returns 0 when ``dimension`` is not positive.
    """
    dimension = int(dimension)
    if dimension <= 0:
        return 0
    bits1 = embed1[:dimension].ljust(dimension, "0")
    bits2 = embed2[:dimension].ljust(dimension, "0")
    # XOR sets a 1 exactly where the embeddings differ.  Working on the whole
    # string at once covers every bit, including a dimension that is not a
    # multiple of the chunk width.  The "0b" prefix of bin() contains no "1".
    differing = bin(int(bits1, 2) ^ int(bits2, 2)).count("1")
    return dimension - differing


def distance(embed1, embed2, dimension):
    """Count the bit positions at which two embeddings differ (Hamming distance).

    Only the first ``dimension`` characters of each embedding are compared.
    An embedding shorter than ``dimension`` is padded on the right with "0"
    so that positions stay aligned.

    Args:
        embed1: First embedding, a string of "0"/"1" characters.
        embed2: Second embedding, a string of "0"/"1" characters.
        dimension: Number of bit positions to compare.

    Returns:
        The number of differing positions, between 0 and ``dimension``.
        Returns 0 when ``dimension`` is not positive.
    """
    dimension = int(dimension)
    if dimension <= 0:
        return 0
    bits1 = embed1[:dimension].ljust(dimension, "0")
    bits2 = embed2[:dimension].ljust(dimension, "0")
    # XOR sets a 1 exactly where the embeddings differ.  Working on the whole
    # string at once covers every bit, including a dimension that is not a
    # multiple of the chunk width.  The "0b" prefix of bin() contains no "1".
    return bin(int(bits1, 2) ^ int(bits2, 2)).count("1")


def xor(embed1, embed2):
    """Return the bitwise XOR of two embeddings as a bit-string.

    The result always has the same length as ``embed1``.  ``embed2`` is cut or
    padded on the right with "0" to that length so positions stay aligned.

    Args:
        embed1: First embedding, a string of "0"/"1" characters.
        embed2: Second embedding, a string of "0"/"1" characters.

    Returns:
        A string of "0"/"1" characters of length ``len(embed1)``.
    """
    width = len(embed1)
    if width == 0:
        return ""
    other = embed2[:width].ljust(width, "0")
    # Operate on the whole string at once so no trailing bits are lost when
    # the length is not a multiple of the chunk width; zfill restores the
    # leading zeros that the integer round-trip drops.
    return format(int(embed1, 2) ^ int(other, 2), "b").zfill(width)


def bitwise_or(embed1, embed2):
    """Return the bitwise OR of two embeddings as a bit-string.

    The result always has the same length as ``embed1``.  ``embed2`` is cut or
    padded on the right with "0" to that length so positions stay aligned.

    Args:
        embed1: First embedding, a string of "0"/"1" characters.
        embed2: Second embedding, a string of "0"/"1" characters.

    Returns:
        A string of "0"/"1" characters of length ``len(embed1)``.
    """
    width = len(embed1)
    if width == 0:
        return ""
    other = embed2[:width].ljust(width, "0")
    # Operate on the whole string at once so no trailing bits are lost when
    # the length is not a multiple of the chunk width; zfill restores the
    # leading zeros that the integer round-trip drops.
    return format(int(embed1, 2) | int(other, 2), "b").zfill(width)


# Run the interactive similarity lookup with default arguments when this file is executed as a script.
if __name__ == "__main__":
    main()