import os
import argparse
import utils
import nltk
from nltk import ConcordanceIndex
from nltk.tokenize import wordpunct_tokenize
from tqdm import tqdm
from utils import *
from easydict import EasyDict as edict
import easydict
from nltk.tokenize import RegexpTokenizer
import random
from datasets import load_dataset
from tokenizers import normalizers
from tokenizers.normalizers import Lowercase, NFD, StripAccents
import multiprocessing
import concurrent.futures
import numpy as np
import re
import fasttext.util
from scipy.spatial.distance import cosine
import scipy
import joblib
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
import math
import matplotlib.pyplot as plt



def main(args):
    """Group each language's senses by part-of-speech tag.

    For every language code in ``args.languages`` (codes joined by ``_``),
    the word -> senses mapping is loaded from ``<LANG>.json`` (upper-cased
    code) under ``config.directories.wordsyns``. Each sense is bucketed by
    its last element (``sense[-1]``), which is used as the POS tag.

    NOTE(review): presumably senses are BabelNet synset IDs whose final
    character encodes the POS -- confirm against the wordsyns files.

    Original author's note: "We use 'sum' option from now on, it's the best."

    Args:
        args: Namespace-like object; only its ``languages`` attribute
            (a ``_``-separated string of language codes) is read.

    Returns:
        dict: ``{language: {pos_tag: [unique senses]}}``. The order of the
        senses inside each list is unspecified (they come from a set).
    """
    config = utils.get_config()

    pos_distribution = {}
    for lang in args.languages.split("_"):
        wordsyns_path = os.path.join(config.directories.wordsyns, lang.upper() + ".json")
        wordsyns_stuff = utils.load(wordsyns_path)

        # Collect straight into sets so duplicates never pile up in memory;
        # senses must be hashable for this to work.
        senses_by_pos = {}
        for sense_list in tqdm(wordsyns_stuff.values()):
            for sense in sense_list:
                senses_by_pos.setdefault(sense[-1], set()).add(sense)

        # Expose plain lists of unique senses per POS tag.
        pos_distribution[lang] = {
            pos_type: list(senses) for pos_type, senses in senses_by_pos.items()
        }

    return pos_distribution




if __name__ == "__main__":
    # Command-line entry point: declare the options, parse them, run main().
    parser = argparse.ArgumentParser(description='Arguments to evaluate on LSIM task')

    # String-valued options as (flag, default) pairs, in declaration order.
    string_options = (
        ('--eval_word_type', 'LSIM'),
        # Language codes joined by underscores: ar_en_es_fi_fr_he_pl_ru_zh
        ('--languages', 'ar_en_es_fi_fr_he_pl_ru_zh'),
        # Author's note: binary and sum are much better than pairwise product.
        ('--embed_type', 'cross_colex_sum'),
        ('--rank_method', 'average'),
        ('--results_save_path', ''),
        # Only words that have valid embeddings for all these methods.
        ('--method_word_intersections', 'cross_colex_sum~fasttext~BERT'),
    )
    for flag, default_value in string_options:
        parser.add_argument(flag, type=str, default=default_value)

    # Boolean switches, converted from text by str2bool; both default to True.
    for flag in ('--PCA', '--use_gpu'):
        parser.add_argument(flag, type=str2bool, default=True)

    args = parser.parse_args()
    main(args)


