import os
import argparse
import utils
import nltk
from nltk import ConcordanceIndex
from nltk.tokenize import wordpunct_tokenize
from tqdm import tqdm
from utils import *
from easydict import EasyDict as edict
import easydict
from nltk.tokenize import RegexpTokenizer
import random
from datasets import load_dataset
from tokenizers import normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents
import multiprocessing
import concurrent.futures
import numpy as np
import re
import fasttext.util
from scipy.spatial.distance import cosine
import scipy
import joblib
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

# Module-level configuration object. The loader functions in this file read
# data locations from `config.directories` (e.g. the SensEmBERT embedding
# directory and the BabelNet word-to-synset directory).
# NOTE(review): `get_config` is not defined or explicitly imported here; it is
# presumably provided by `from utils import *` -- confirm.
config = get_config()

def load_sense_embeddings(lang):
    """Load the pretrained SensEmBERT sense vectors for ``lang``.

    The file ``sensembert_<LANG>_kb.txt`` is read from
    ``config.directories.SensEmBERT_pretrained_sense_embeds``. Its first line
    is a header (total vector count and vector dimension) and is skipped.
    Every following line is ``<sense-key> <float> <float> ...`` separated by
    whitespace.

    Sense keys are normalised before being used as dictionary keys:

    * English (``lang == 'en'``): the part before the first ``:`` is split on
      ``%`` into a word and a number, and rewritten as ``<word>.n.<NN>``
      (a single-digit number gets a leading ``0``) so that it lines up with
      WordNet-style synset names.
    * Any other language: the part after the first ``%`` is used as-is
      (these are BabelNet identifiers).

    Args:
        lang: Language code such as ``'en'``, ``'es'`` or ``'fr'``; it is
            upper-cased to build the file name.

    Returns:
        dict mapping the normalised sense key to a 1-D ``numpy`` float array.

    Raises:
        OSError: The embedding file cannot be opened.
        IndexError: A non-blank line has a sense key without ``%``.
        ValueError: A vector component (or the English sense number) cannot
            be parsed as a number.
    """
    # English and non-English currently read the same "_kb" file; the original
    # code carried a disabled alternative that read "sensembert_EN_supervised.txt"
    # for English.
    embed_file = os.path.join(config.directories.SensEmBERT_pretrained_sense_embeds,
                              "sensembert_" + lang.upper() + "_kb.txt")
    sense_embeds = {}
    # Stream the file inside a context manager: the handle is always closed and
    # the (potentially very large) file is never held in memory as a list.
    with open(embed_file, 'r', encoding='utf-8') as embed_handle:
        print("Loading sense embeddings for SensEmBERT...")
        next(embed_handle, None)  # header: total vector count and dimension
        for line in tqdm(embed_handle):
            # split() with no argument drops the newline and tolerates
            # trailing or repeated spaces.
            pieces = line.split()
            if not pieces:
                continue  # blank line (e.g. at end of file)
            sense = pieces[0]
            if lang == 'en':
                key_parts = sense.split(":")[0].split("%")
                word_part = key_parts[0]
                number_part = key_parts[1]
                if int(number_part) < 10:
                    number_part = "0" + number_part
                # They mention in the paper that they are ALL nouns.
                final_sense_form = word_part + ".n." + number_part
            else:
                # They are BabelNet embeddings.
                final_sense_form = sense.split("%")[1]
            sense_embeds[final_sense_form] = np.asarray(pieces[1:], dtype=float)
    print("Done.")
    return sense_embeds

def load_BabelNet_synsets_for_language(lang):
    """Load the word-to-synsets mapping for ``lang`` from its JSON file.

    The file ``<LANG>.json`` (language code upper-cased) is read from
    ``config.directories.wordsyns``.

    Args:
        lang: Language code such as ``'es'`` or ``'fr'``.

    Returns:
        The decoded JSON content. ``get_word_embedding`` in this file indexes
        it by word and iterates the value as a collection of sense keys.

    Raises:
        OSError: The file cannot be opened.
        json.JSONDecodeError: The file is not valid JSON.
    """
    # Imported locally because this module has no top-level `import json`;
    # the function must not depend on a star import leaking the name.
    import json

    word2syn_file = os.path.join(config.directories.wordsyns, lang.upper() + ".json")
    # Explicit UTF-8: non-English vocabularies contain non-ASCII characters and
    # the platform default encoding is not guaranteed to be UTF-8.
    with open(word2syn_file, encoding="utf-8") as json_file:
        return json.load(json_file)

def get_word_embedding(word, lang, sense_embeddings, BabelNetSyns=None):
    """Return the mean of all known sense embeddings of ``word``.

    The sense inventory depends on the language: English senses are WordNet
    synsets (looked up through ``wn.synsets`` and matched by ``syn.name()``),
    while the other languages use the sense keys listed for the word in
    ``BabelNetSyns``.

    Args:
        word: The word to embed.
        lang: One of ``'en'``, ``'es'`` or ``'fr'``.
        sense_embeddings: Mapping from sense key to embedding vector, keyed the
            same way as the names produced for ``lang``.
        BabelNetSyns: Mapping from word to a collection of sense keys.
            Required when ``lang`` is not ``'en'``; ignored for English.

    Returns:
        The element-wise mean (``numpy`` array) of the embeddings of every
        sense of ``word`` that is present in ``sense_embeddings``, or ``None``
        when the word has no synsets or none of them has an embedding.

    Raises:
        KeyError: ``lang`` is not a supported language.
        TypeError: ``lang`` is not ``'en'`` and ``BabelNetSyns`` is ``None``.
    """
    supported_langs = ('en', 'es', 'fr')
    if lang not in supported_langs:
        # Same exception the original dictionary lookup raised.
        raise KeyError(lang)

    if lang == 'en':
        # NOTE(review): `wn` is not explicitly imported in this module; it is
        # presumably NLTK's WordNet reader exposed via `from utils import *`.
        names = [syn.name() for syn in wn.synsets(word)]
    else:
        if BabelNetSyns is None:
            raise TypeError(
                "BabelNetSyns is required when lang is not 'en' (got None)")
        if word not in BabelNetSyns:
            return None
        names = BabelNetSyns[word]
        if names is None:
            return None

    # Collect the embedding of every sense we actually have a vector for.
    found = [sense_embeddings[name] for name in names if name in sense_embeddings]
    if not found:
        return None
    return np.mean(np.asarray(found), axis=0)

