import os
import pandas as pd

from utility.models_utility import model_by
from defiNNet.DefiNNet import DefiNNet
from scipy.stats import spearmanr
import numpy as np

from utility.similarity_evaluator import SimilarityEvaluator


class Data:
    """Container for one benchmark dataset.

    Attributes set here:
        X: list that receives the word pairs (one two-item list per pair).
        y: list that receives the gold similarity score of each pair.

    The loader in this module (``tasks``) may additionally attach
    ``POS_word1`` / ``POS_word2`` when the benchmark file provides them, so
    consumers should probe for those attributes with ``hasattr``.
    """

    def __init__(self):
        # Both start empty; each instance gets its own fresh lists.
        self.X, self.y = [], []


def tasks(type="all"):
    """Load every benchmark file found in ``data/benchmarks``.

    Each file is read with ``pandas.read_csv`` and must provide the columns
    ``word1``, ``word2`` and ``similarity``. When both ``POS_word1`` and
    ``POS_word2`` columns are present, their values are attached to the
    returned ``Data`` object under the same names.

    Args:
        type: benchmark type selector. Currently ignored: every file in the
            directory is loaded regardless of its value.

    Returns:
        A list of ``(dataset_name, data)`` tuples, where ``dataset_name`` is
        the file name up to its first ``.`` and ``data`` is a ``Data`` whose
        ``X`` holds ``[word1, word2]`` string pairs and ``y`` the float
        similarity scores. Datasets are ordered by file name.
    """
    path = "data/benchmarks"
    loaded = []
    # os.listdir gives entries in arbitrary order; sort so that the order of
    # the returned datasets (and of everything derived from it) is stable.
    for fname in sorted(os.listdir(path)):
        full_path = os.path.join(path, fname)
        if not os.path.isfile(full_path):
            # Sub-directories cannot be parsed as a benchmark file.
            continue

        df = pd.read_csv(full_path)
        data = Data()
        if "POS_word1" in df.columns and "POS_word2" in df.columns:
            data.POS_word1 = df.POS_word1.values
            data.POS_word2 = df.POS_word2.values

        triples = zip(df.word1.values, df.word2.values, df.similarity.values)
        for first, second, score in triples:
            data.X.append([str(first), str(second)])
            data.y.append(float(score))

        loaded.append((fname.split('.')[0], data))

    return loaded


def words_in_tasks(type="all") -> dict:
    """Count how often each word occurs as the first word of a benchmark pair.

    Only the first element of every pair in the datasets returned by
    ``tasks(type)`` is counted; the second word of a pair is not considered.

    Returns:
        dict mapping word -> number of pairs in which it is the first word.
    """
    counts = {}
    for _dataset_name, dataset in tasks(type):
        for pair in dataset.X:
            first_word = pair[0]
            counts[first_word] = counts.get(first_word, 0) + 1
    return counts


def all_words_in_tasks(types: list = None) -> dict:
    """Return the word counts for the requested benchmark types.

    Args:
        types: list of benchmark types. Only ``None`` is supported, in which
            case the result of ``words_in_tasks(type="all")`` is returned.

    Raises:
        NotImplementedError: if ``types`` is anything other than ``None``.
    """
    # Selecting specific benchmark types has not been implemented.
    if types is not None:
        raise NotImplementedError("Not Implemented yet")

    return words_in_tasks(type="all")


def predict_according_to(test_model, model_name, dataX, posX0):
    """Return the (embedding, word) results for both words of a pair.

    Args:
        test_model: the model under evaluation.
        model_name: name of the model; ``'w2v'`` selects plain lookup.
        dataX: the pair of words to compare (first word at index 0, second
            at index 1).
        posX0: part of speech of the first word, or ``None`` when unknown.
            Unused for ``'w2v'``.

    Returns:
        A pair ``(result_for_first_word, result_for_second_word)``.

    Per the original notes, out-of-vocabulary words fall back to the
    embedding of "entity", so every pair yields a result. That fallback
    lives in the model/preprocessor, not in this function.
    """
    if model_name != 'w2v':
        # Additive, Head and DefiNNet build the first embedding with their own
        # rule (using the POS when available); the second one is looked up
        # through the model's preprocessor.
        first_emb, first_word = test_model.predict_and_word(dataX[0], posX0)
        second_emb, second_word = test_model.preprocessor.get_vector_and_word(dataX[1])
        return (first_emb, first_word), (second_emb, second_word)

    # Plain w2v: both words are looked up directly and returned as-is.
    first = test_model.get_vector_and_word(dataX[0])
    second = test_model.get_vector_and_word(dataX[1])
    return first, second


def perform_tasks(pos_list=None, model_mappings=None):
    """Evaluate models on every benchmark and write per-dataset result CSVs.

    For each model in ``model_mappings`` and each dataset returned by
    ``tasks``, every word pair is scored with a
    ``SimilarityEvaluator('cosine_similarity')``. The Spearman correlation
    between the (negated) scores and the gold similarities is printed and
    stored, and the per-pair values are written to
    ``data/benchmark_results/<typ>/<dataset>/<model_name>_<dataset>.csv``.

    Args:
        pos_list: optional collection of lower-case POS tags. When given and
            the dataset carries ``POS_word1``, pairs whose first-word POS is
            not in the collection are skipped. Datasets without POS
            information are never filtered.
        model_mappings: dict mapping a model name to the argument handed to
            ``model_by``. Defaults to the additive, head, definnet and w2v
            models built on the GoogleNews pretrained embeddings.

    Returns:
        Nested dict ``spearman[model_name][typ][dataset_name]`` holding the
        Spearman correlation of each evaluation.
    """
    if model_mappings is None:
        pretrained_model = 'data/pretrained_embeddings/GoogleNews-vectors-negative300.bin'

        model_mappings = {
            'additive': ['additive_model', pretrained_model],
            'head': ['head_model', pretrained_model],
            'definnet': ['definnet', pretrained_model],
            'w2v': ['w2v', pretrained_model],
        }

    results_root = 'data/benchmark_results'
    # makedirs also creates a missing parent ("data") and tolerates an
    # already existing directory.
    os.makedirs(results_root, exist_ok=True)

    spearman = {}

    for model_name, mapping in model_mappings.items():
        spearman[model_name] = {}
        test_model = model_by(mapping)
        # Exact type check kept on purpose: it is not visible here whether
        # other models subclass DefiNNet, so isinstance could change results.
        is_definnet = type(test_model) is DefiNNet

        for typ in ['all']:
            spearman[model_name][typ] = {}
            beresult_out = os.path.join(results_root, typ)
            os.makedirs(beresult_out, exist_ok=True)

            for name, data in tasks(type=typ):
                beresult_dataset_out = os.path.join(beresult_out, name)
                os.makedirs(beresult_dataset_out, exist_ok=True)

                has_pos1 = hasattr(data, "POS_word1")
                evaluator = SimilarityEvaluator('cosine_similarity')

                kept = []  # indices of the pairs that were actually evaluated
                cosines = []
                true = []
                w1_computed = []
                w2_computed = []
                for i, pair in enumerate(data.X):
                    if has_pos1:
                        pos = data.POS_word1[i].lower()
                        if pos_list is not None and pos not in pos_list:
                            continue
                    else:
                        pos = None
                    (e1, w1), (e2, w2) = predict_according_to(test_model, model_name, pair, pos)

                    kept.append(i)
                    w1_computed.append(w1)
                    w2_computed.append(w2)

                    pred = evaluator.similarity_function(e1, e2)
                    # The evaluator output is negated before correlating.
                    # NOTE(review): presumably it is distance-like - confirm
                    # against SimilarityEvaluator.
                    cosines.append(- pred[0] if is_definnet else - pred)
                    true.append(data.y[i])

                s = spearmanr(cosines, true)
                print('\t'.join([model_name, typ, name, str(s.correlation), str(len(cosines))]))
                spearman[model_name][typ][name] = s.correlation

                # Build every column from the evaluated rows only, so all
                # columns have the same length even when pos_list skipped
                # some pairs.
                output_df = pd.DataFrame([])
                output_df["word1"] = [data.X[i][0] for i in kept]
                output_df["word2"] = [data.X[i][1] for i in kept]
                output_df["w1_computed"] = w1_computed
                output_df["w2_computed"] = w2_computed

                if has_pos1 and hasattr(data, "POS_word2"):
                    output_df["POS_word1"] = [data.POS_word1[i] for i in kept]
                    output_df["POS_word2"] = [data.POS_word2[i] for i in kept]

                output_df["similarity"] = true
                output_df["cosine"] = cosines

                output_df.to_csv(os.path.join(beresult_dataset_out, model_name + "_" + name + ".csv"))

    return spearman


