# coding:utf-8
import json
from nltk.stem import WordNetLemmatizer
import string
import re
import random
import torch
from tqdm import tqdm
import opencc
from transformers import XLMRobertaTokenizer, XLMRobertaModel, AdamW
import OpenHowNet
import thulac
import numpy as np

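# NOTE: the module-level setup below is commented out, but gen_data_1 and
# gen_data_2 still read these names (src_data, wnl, cc, seg_thulac, tokenizer,
# hownet_dict) as globals; re-enable the relevant lines before calling them.
# src_data_path = 'data_r/babel_data.json'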
# src_data = json.load(open(src_data_path))
# dst_data = []
# dst_data_lenght = []

# hownet_dict = OpenHowNet.HowNetDict()

# cc = opencc.OpenCC('t2s')
# wnl = WordNetLemmatizer()
# seg_thulac = thulac.thulac(seg_only=True, T2S=True)
# tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# sememe list
# with open('./tlm_data/sememe_all.txt', 'r', encoding='utf-8') as f:
#     sememe_str = f.read()
# f.close() 
# sememe_list = sememe_str.split(' ')

def gen_data_1():
    """Build the intermediate dataset and its per-entry length statistics.

    Reads the global ``src_data`` mapping. For every entry it stores the
    sememe names (the part after the first ``|`` of each ``sememes`` item)
    under ``'s'`` and, for each language whose word *and* definition keys are
    both present, the word (``w_e`` / ``w_c`` / ``w_f``) and, when the
    definition list is non-empty, the tokenised definitions
    (``d_e`` / ``d_c`` / ``d_f``).

    Relies on the globals ``wnl`` (English), ``cc`` and ``seg_thulac``
    (Chinese). Writes ``./final_data/data.json`` and
    ``./final_data/data_length.json``; returns nothing.
    """
    # One row per language:
    #   (word key, definition key, output word key, output definition key,
    #    output length key, word transform, definition tokeniser)
    # Row order fixes the key order of every output record.
    language_specs = (
        ('word_en', 'definition_en', 'w_e', 'd_e', 'd_e_l',
         lambda word: word,
         # The whole definition string is handed to the lemmatizer at once.
         lambda text: wnl.lemmatize(text).lower().split(' ')),
        ('word_cn', 'definition_cn', 'w_c', 'd_c', 'd_c_l',
         lambda word: cc.convert(word),
         lambda text: seg_thulac.cut(text, text=True).split(' ')),
        ('word_fr', 'definition_fr', 'w_f', 'd_f', 'd_f_l',
         lambda word: word,
         lambda text: text.lower().split(' ')),
    )

    records = []
    record_lengths = []
    for key in tqdm(src_data):
        entry = src_data[key]

        sememes = [raw.split('|')[1] for raw in entry['sememes']]
        record = {'s': sememes}
        lengths = {'s_l': len(sememes)}

        for (word_key, def_key, out_word, out_def, out_len,
             convert_word, tokenise) in language_specs:
            # A language is only used when both its word and definitions exist.
            if word_key not in entry or def_key not in entry:
                continue
            record[out_word] = convert_word(entry[word_key])

            definitions = entry[def_key]
            if len(definitions) == 0:
                continue
            tokenised = [tokenise(text) for text in definitions]
            record[out_def] = tokenised
            # Number of definitions, not number of tokens.
            lengths[out_len] = len(tokenised)

        records.append(record)
        record_lengths.append(lengths)

    with open('./final_data/data.json', 'w', encoding='utf-8') as out:
        json.dump(records, out, ensure_ascii=False)
    with open('./final_data/data_length.json', 'w', encoding='utf-8') as out:
        json.dump(record_lengths, out, ensure_ascii=False)


def _inner_token_ids(text):
    """Return the global ``tokenizer``'s input ids for *text* minus the first and last id.

    The first and last ids are presumably the special tokens the tokenizer
    wraps around every input -- TODO confirm against the tokenizer in use.
    """
    return list(tokenizer(text)['input_ids'][1:-1])


def _sememe_indices(word, sememe_index):
    """Return the indices of the sememes HowNet reports for *word*.

    Queries the global ``hownet_dict``. Sememes that are not keys of
    *sememe_index* are dropped; an empty list means "nothing usable found".
    """
    found = hownet_dict.get_sememes_by_word(
        word, structured=False, lang="zh", merge=True)
    if not found:
        return []
    if isinstance(found, dict):
        # Only the value of the first entry of the mapping is used.
        found = list(list(found.items())[0][1])
    elif isinstance(found, set):
        found = list(found)
    # The sememe vocabulary holds strings only, so nothing else can match.
    return [sememe_index[s] for s in found
            if isinstance(s, str) and s in sememe_index]


def _encode_definition(words, sememe_index):
    """Encode one tokenised definition.

    Returns ``(token_ids, positions_to_sememes)`` where ``token_ids`` is the
    concatenation of the inner token ids of every word and
    ``positions_to_sememes`` holds one ``[positions, sememe_indices]`` pair
    for every word that has at least one known sememe. Positions are 1-based
    offsets into ``token_ids``.
    """
    token_ids = []
    positions_to_sememes = []
    for word in words:
        piece_ids = _inner_token_ids(word)
        first = len(token_ids) + 1
        positions = list(range(first, first + len(piece_ids)))
        token_ids.extend(piece_ids)

        sememe_ids = _sememe_indices(word, sememe_index)
        if sememe_ids:
            positions_to_sememes.append([positions, sememe_ids])
    return token_ids, positions_to_sememes


def gen_data_2():
    """Convert ``./final_data/data.json`` into id-based training records.

    For every instance the output record contains:

    * ``s_i``: indices of the instance's sememes in the sememe vocabulary
      read from ``./tlm_data/sememe_all.txt`` (space separated);
    * ``w_{e,c,f}_ids``: inner token ids of the English/Chinese/French word;
    * ``w_{e,c,f}_s``: sememe indices HowNet reports for that word;
    * ``d_{e,c,f}_ids``: one list of token ids per definition;
    * ``d_{e,c,f}_i2s``: per definition, ``[positions, sememe_indices]``
      pairs for the words that have known sememes.

    Fields of a language that is absent from the instance stay empty lists.
    The result is written to ``./final_data/final_data.json``.

    Relies on the globals ``tokenizer`` and ``hownet_dict``.

    Raises:
        ValueError: if an instance sememe is missing from the vocabulary.
    """
    with open('./tlm_data/sememe_all.txt', 'r', encoding='utf-8') as f:
        sememe_list = f.read().split(' ')
    # First-occurrence index, i.e. the same answer list.index() would give,
    # but with O(1) lookups instead of a scan per token.
    sememe_index = {}
    for position, sememe in enumerate(sememe_list):
        sememe_index.setdefault(sememe, position)

    # data.json was written as UTF-8, so read it back the same way.
    with open('./final_data/data.json', encoding='utf-8') as f:
        data = json.load(f)

    final_data = []
    for instance in tqdm(data):
        labels = instance['s']
        missing = [s for s in labels if s not in sememe_index]
        if missing:
            raise ValueError(
                'sememes not found in sememe list: {!r}'.format(missing))

        # Every key is created up front so the key order of the JSON output
        # does not depend on which languages an instance has.
        record = {
            's_i': [sememe_index[s] for s in labels],
            'w_e_ids': [], 'w_c_ids': [], 'w_f_ids': [],
            'w_e_s': [], 'w_c_s': [], 'w_f_s': [],
            'd_e_ids': [], 'd_c_ids': [], 'd_f_ids': [],
            'd_e_i2s': [], 'd_c_i2s': [], 'd_f_i2s': [],
        }

        for lang in ('e', 'c', 'f'):
            word_key = 'w_' + lang
            if word_key in instance:
                word = instance[word_key]
                record[word_key + '_ids'] = _inner_token_ids(word)
                record[word_key + '_s'] = _sememe_indices(word, sememe_index)

            def_key = 'd_' + lang
            if def_key in instance:
                for definition in instance[def_key]:
                    token_ids, i2s = _encode_definition(
                        definition, sememe_index)
                    record[def_key + '_ids'].append(token_ids)
                    record[def_key + '_i2s'].append(i2s)

        final_data.append(record)

    with open('./final_data/final_data.json', 'w', encoding='utf-8') as fout:
        json.dump(final_data, fout, ensure_ascii=False)


def get_length():
    """Add per-definition token counts to the length statistics.

    Loads ``./final_data/data_length.json`` and
    ``./final_data/final_data.json``, checks that they describe the same
    entries, and adds to every length record the keys ``d_e_ids_l``,
    ``d_c_ids_l`` and ``d_f_ids_l``: the length of each list in the matching
    ``d_*_ids`` field. The enriched records are written to
    ``./final_data/final_data_length.json``.

    Raises:
        AssertionError: if the two files differ in length, or an entry's
            ``s_i`` length does not match its recorded ``s_l``.
    """
    with open('./final_data/data_length.json', encoding='utf-8') as fin:
        length = json.load(fin)
    with open('./final_data/final_data.json', encoding='utf-8') as fin:
        data = json.load(fin)

    assert len(length) == len(data)
    for stats, record in zip(length, data):
        assert len(record['s_i']) == stats['s_l']
        for field in ('d_e_ids', 'd_c_ids', 'd_f_ids'):
            stats[field + '_l'] = [len(ids) for ids in record[field]]

    # The context manager guarantees the output is flushed and closed.
    with open('./final_data/final_data_length.json', 'w',
              encoding='utf-8') as fout:
        json.dump(length, fout, ensure_ascii=False)

def get_length2(expected_count=15756, max_length=512):
    """Print summary statistics of the combined definition length per entry.

    For every record in ``./final_data/final_data_length.json`` the length of
    the *first* English, Chinese and French definition (when present) is
    summed. The function then prints, in this order:

    1. the 25th/50th/75th percentiles (midpoint method),
    2. the mean,
    3. the fraction of entries whose combined length exceeds *max_length*.

    Args:
        expected_count: number of entries the file must contain; pass
            ``None`` to skip the check. Defaults to the original hard-coded
            value.
        max_length: threshold used for the "too long" fraction.

    Raises:
        AssertionError: if *expected_count* is given and does not match.
        ValueError: if the file contains no entries.
    """
    with open('./final_data/final_data_length.json', encoding='utf-8') as fin:
        length = json.load(fin)

    def_length = []
    for stats in length:
        total = 0
        for key in ('d_e_ids_l', 'd_c_ids_l', 'd_f_ids_l'):
            per_definition = stats[key]
            if len(per_definition) > 0:
                # Only the first definition of each language is counted.
                total += per_definition[0]
        def_length.append(total)

    if expected_count is not None:
        assert len(def_length) == expected_count, (
            'expected {} entries, found {}'.format(
                expected_count, len(def_length)))
    if not def_length:
        raise ValueError('no entries in final_data_length.json')

    try:
        # NumPy >= 1.22 spells the keyword ``method``.
        quartiles = np.percentile(def_length, (25, 50, 75), method='midpoint')
    except TypeError:
        # Older NumPy only knows the (now deprecated) ``interpolation``.
        quartiles = np.percentile(
            def_length, (25, 50, 75), interpolation='midpoint')
    print(quartiles)

    print(sum(def_length) / len(def_length))

    too_long = sum(1 for value in def_length if value > max_length)
    print(too_long / len(def_length))
# Script entry point: running the file only prints the definition-length
# statistics; gen_data_1, gen_data_2 and get_length are not invoked here.
if __name__ == '__main__':
    get_length2()

            

    