import os
import argparse
import utils
import nltk
from nltk import ConcordanceIndex
from nltk.tokenize import wordpunct_tokenize
from tqdm import tqdm
from utils import *
from easydict import EasyDict as edict
import easydict
from nltk.tokenize import RegexpTokenizer
import random
from datasets import load_dataset
from tokenizers import normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents
import multiprocessing
import concurrent.futures
import numpy as np
import re
import fasttext.util
from scipy.spatial.distance import cosine
import scipy
import joblib
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression

# Module-wide configuration object, read by the loaders below (config.directories.*).
# NOTE(review): get_config is not defined in this file; presumably it is provided
# by `from utils import *` -- confirm.
config = get_config()

def load_sense_embeddings(lang, syn_PCA=False):
    """Load the pretrained ARES sense embeddings into a dictionary.

    The embedding file is plain text: the first line is a header (total vector
    count and vector dimension) and every following line is a sense key
    followed by its whitespace-separated float components.

    Args:
        lang: Language code. Currently every language (English included) uses
            the same multilingual ARES file, so this does not change the path.
        syn_PCA: When True, the loaded embeddings are passed through
            ``LSIM.get_PCA`` before being returned.

    Returns:
        dict mapping each sense key (first token of a line, used as-is) to a
        1-D ``numpy`` array of floats.

    Raises:
        OSError: If the embedding file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    # For now the multilingual file is used for all languages, English too.
    embed_file = os.path.join(config.directories.ARES_pretrained_sense_embeds, "ares_bert_base_multilingual.txt")
    sense_embeds = {}
    # The context manager guarantees the handle is closed, and iterating the
    # handle streams the file instead of materialising every line in memory.
    with open(embed_file, 'r') as embed_fh:
        print("Loading synset embeddings for ARES...")
        next(embed_fh, None)  # skip the header line (vector count and dimension)
        for line in tqdm(embed_fh):
            # split() with no argument drops the trailing newline and tolerates
            # repeated/trailing spaces that would otherwise yield float('').
            pieces = line.split()
            if not pieces:
                continue  # blank line: nothing to store
            # The keys are BabelNet sense identifiers and are kept unchanged.
            sense_embeds[pieces[0]] = np.asarray([float(x) for x in pieces[1:]])
    if syn_PCA:
        # Imported lazily so LSIM is only required when PCA is requested.
        from LSIM import get_PCA
        print('Getting PCA on synset embeddings...')
        sense_embeds = get_PCA(sense_embeds)
        print('Done.')
    print("Done.")
    return sense_embeds

def load_BabelNet_synsets_for_language(lang):
    """Load the word-to-synsets mapping for one language from its JSON file.

    The file is looked up as ``<config.directories.wordsyns>/<LANG>.json``,
    where ``<LANG>`` is ``lang`` upper-cased.

    Args:
        lang: Language code (case-insensitive; it is upper-cased for the
            file name).

    Returns:
        The parsed JSON content of the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Explicit local import: the module never imports json itself and would
    # otherwise depend on a star import happening to expose it.
    import json

    word2syn_file = os.path.join(config.directories.wordsyns, lang.upper() + ".json")
    # JSON is UTF-8 by specification; stating it avoids decode errors for
    # non-ASCII words on platforms whose default encoding differs.
    with open(word2syn_file, encoding="utf-8") as json_file:
        return json.load(json_file)

def get_word_embedding(word, lang, sense_embeddings, BabelNetSyns=None):
    """Return the mean sense embedding of ``word``, or None if unavailable.

    The word's synsets are looked up in ``BabelNetSyns``; every synset that
    has an entry in ``sense_embeddings`` contributes its vector, and the
    element-wise mean of those vectors is returned.

    Args:
        word: The word to embed; used as the key into ``BabelNetSyns``.
        lang: Unused by this function; kept for interface compatibility.
        sense_embeddings: Mapping from synset name to its embedding vector.
        BabelNetSyns: Mapping from word to an iterable of synset names. When
            omitted (None), no synsets are known and None is returned.

    Returns:
        A ``numpy`` array holding the averaged embedding, or None when the
        word has no synsets or none of its synsets has an embedding.
    """
    # The parameter defaults to None; previously that crashed on the
    # membership test instead of reporting "no embedding".
    if BabelNetSyns is None:
        return None
    synsets = BabelNetSyns.get(word)
    if synsets is None:
        return None
    # Collect the embeddings of all synsets that actually have a vector.
    found = [sense_embeddings[syn] for syn in synsets if syn in sense_embeddings]
    if not found:
        return None
    return np.mean(np.asarray(found), axis=0)

