import pickle
import numpy as np
import xlrd
import gensim
# Excel workbooks are opened with xlrd
from sklearn import preprocessing

import xlsxwriter as xw
#mix_CSS_radicalgraph

# Module-level lookup tables. They start empty and are filled in place by
# readJia(), readJia1() and readJia2().

# Radical table (filled by readJia1): radical id <-> radical.
id2word={}# a "word" here is a radical
word2id={}
# Per-row tables filled by readJia2. "id" is the id string that readJia2
# builds for each data row; "index" is the running 0-based row counter.
id2character={}
id2radicalstring={}
index2character={}
index2id={}
id2index={}
character2index={}
index2radicalstring={}
# Related radical pairs (filled by readJia): keys are "idA idB" strings stored
# in both directions, values are empty strings -- only key membership is used.
r12r2={}

# Tables keyed by an id with its last character removed (filled by readJia2):
# cid2id maps that shortened id to the list of full ids sharing it, while
# index2cid / index2ccharacter number the shortened ids in order of first
# appearance and remember the character of the first row seen for each.
cid2id={}
index2ccharacter={}
index2cid={}




def readJia():
    """Load the related-radical pairs from the '相关部件对' worksheet.

    For every row, the first two cells are joined with a single space in both
    orders. Each of the two strings is stored as a key of the module-level
    ``r12r2`` dict (with an empty-string value) and appended to the result.

    Returns:
        list[str]: two ``"a b"`` strings per sheet row (forward, then
        reversed), in sheet order.
    """
    workbook = xlrd.open_workbook('../data/20210520_oracle-radical.xlsx')
    # Locate the worksheet by its name.
    sheet = workbook.sheet_by_name('相关部件对')

    pairs = []
    for row_idx in range(sheet.nrows):
        cells = sheet.row_values(row_idx)
        first, second = cells[0], cells[1]
        forward = first + " " + second
        backward = second + " " + first
        r12r2[forward] = ""
        r12r2[backward] = ""
        pairs.extend((forward, backward))
    return pairs

def readJia1():
    """Fill the radical lookups from the '部件表' worksheet.

    The first row is skipped as a header. For each remaining row, the radical
    id is the second ``"_"``-separated field of column 0 and the radical is
    column 1; both directions are stored in the module-level ``id2word`` and
    ``word2id`` dicts.

    Returns:
        The module-level ``word_pairdic`` object when one exists, otherwise
        ``None``. The value is unrelated to the work done here and is kept
        only so that existing callers see the same result as before.
    """
    workbook = xlrd.open_workbook('../data/20210520_oracle-radical.xlsx')
    # Locate the worksheet by its name.
    sheet = workbook.sheet_by_name('部件表')

    # Start at 1 to skip the header row.
    for row_idx in range(1, sheet.nrows):
        cells = sheet.row_values(row_idx)
        radical_id = cells[0].split("_")[1]
        radical = cells[1]
        id2word[radical_id] = radical
        word2id[radical] = radical_id

    # This function never defines ``word_pairdic``; naming it directly raises
    # NameError whenever the module global has not been created yet, so look
    # it up defensively instead.
    return globals().get("word_pairdic")


def readJia2(word_pairdic, word2id):
    """Load the '甲骨文-部件对应表' worksheet and extend the pair list.

    The first row is skipped as a header. For every other row:

    * the id is the concatenation of the 4th and 5th ``"_"``-separated fields
      of column 0, the character is column 1, and column 3 holds the radicals
      (comma-separated when there are several);
    * the module-level per-row tables (``id2character``, ``id2radicalstring``,
      ``index2radicalstring``, ``index2character``, ``character2index``,
      ``index2id``, ``id2index``) are filled, using a running row counter as
      the index;
    * ids are grouped in ``cid2id`` under the id minus its last character; the
      first time such a shortened id is seen it is also numbered in
      ``index2cid`` / ``index2ccharacter``;
    * for every radical that is a key of ``word2id``, the strings
      ``"<id> <radical id>"`` and ``"<radical id> <id>"`` are appended to
      ``word_pairdic`` unless already present.

    Args:
        word_pairdic: list of ``"a b"`` pair strings; extended in place.
        word2id: mapping from radical to radical id.

    Returns:
        The same ``word_pairdic`` list object.
    """
    workbook = xlrd.open_workbook('../data/20210520_oracle-radical.xlsx')
    sheet = workbook.sheet_by_name('甲骨文-部件对应表')

    # Set mirror of the list so the "already present?" test is O(1) instead
    # of a scan over an ever-growing list; the list keeps the ordering.
    seen_pairs = set(word_pairdic)

    row_index = 0   # running index over data rows
    cid_index = 0   # running index over distinct shortened ids
    for sheet_row in range(1, sheet.nrows):  # row 0 is the header
        cells = sheet.row_values(sheet_row)
        id_fields = cells[0].split("_")
        full_id = id_fields[3] + id_fields[4]
        character = cells[1]
        # str.split returns a one-element list when there is no comma.
        radicals = cells[3].split(",")
        radical_string = "".join(radicals)

        id2character[full_id] = character
        id2radicalstring[full_id] = radical_string
        index2radicalstring[row_index] = radical_string
        index2character[row_index] = character
        character2index[character] = row_index
        index2id[row_index] = full_id
        id2index[full_id] = row_index
        row_index += 1

        short_id = full_id[:-1]
        if short_id in cid2id:
            cid2id[short_id].append(full_id)
        else:
            cid2id[short_id] = [full_id]
            index2cid[cid_index] = short_id
            index2ccharacter[cid_index] = character
            cid_index += 1

        for radical in radicals:
            if radical not in word2id:
                continue
            radical_id = word2id[radical]
            for pair in (full_id + " " + radical_id,
                         radical_id + " " + full_id):
                if pair not in seen_pairs:
                    seen_pairs.add(pair)
                    word_pairdic.append(pair)
    return word_pairdic

def getsamefont(word_pairdic, id2character):
    """Add pairs linking ids that are equal once their last character is cut.

    For every two keys of ``id2character`` whose ``[:-1]`` prefixes are equal,
    the strings ``"<a> <b>"`` and ``"<b> <a>"`` are appended to
    ``word_pairdic`` unless already present. Pairs are produced in the order
    of a nested scan over the keys (earlier key first, then each later key
    with the same prefix).

    Args:
        word_pairdic: list of ``"a b"`` pair strings; extended in place.
        id2character: mapping whose keys are the id strings to compare.

    Returns:
        The same ``word_pairdic`` list object.
    """
    # O(1) membership instead of scanning the growing list for each pair.
    seen_pairs = set(word_pairdic)

    # Bucket ids by prefix (insertion order kept) so only ids that can
    # actually match are compared, instead of every pair of ids.
    groups = {}
    for current_id in id2character:
        groups.setdefault(current_id[:-1], []).append(current_id)

    # consumed[prefix] = how many ids of that bucket the outer scan has
    # passed, i.e. where the "later ids" of the current id start.
    consumed = {}
    for current_id in id2character:
        prefix = current_id[:-1]
        consumed[prefix] = consumed.get(prefix, 0) + 1
        for later_id in groups[prefix][consumed[prefix]:]:
            for pair in (current_id + " " + later_id,
                         later_id + " " + current_id):
                if pair not in seen_pairs:
                    seen_pairs.add(pair)
                    word_pairdic.append(pair)
    return word_pairdic


def getNumofCommonSubstr(str1, str2, relatedRedical=True):
    """Score the best common contiguous run of radicals of two strings.

    Longest-common-substring dynamic programme in which a position pair
    extends the run ending just before it by

    * 1 when the two characters are equal, or
    * 0.7 when ``relatedRedical`` is set and the two radical ids form a key
      of the module-level ``r12r2`` table (either order).

    Characters that are not keys of the module-level ``word2id`` never
    match, even when they are identical.

    Args:
        str1: first radical string.
        str2: second radical string.
        relatedRedical: whether related (non-identical) radicals count as a
            0.7 match.

    Returns:
        The highest run score found; 0 when nothing matches. The value is an
        int when only exact matches contributed, otherwise a float.
    """
    # record[i + 1][j + 1] holds the score of the best run ending at
    # str1[i] / str2[j]; the extra zero row and column remove boundary checks.
    record = [[0] * (len(str2) + 1) for _ in range(len(str1) + 1)]
    maxNum = 0  # best run score seen so far

    for i, ch1 in enumerate(str1):
        # Explicit membership test instead of a bare ``except`` around the
        # lookups, so unrelated errors are no longer silently swallowed.
        if ch1 not in word2id:
            continue
        r1id = word2id[ch1]
        for j, ch2 in enumerate(str2):
            if ch2 not in word2id:
                continue
            if ch1 == ch2:
                gain = 1
            elif relatedRedical:
                r2id = word2id[ch2]
                if (r1id + " " + r2id in r12r2) or (r2id + " " + r1id in r12r2):
                    gain = 0.7
                else:
                    continue
            else:
                continue

            score = record[i][j] + gain
            record[i + 1][j + 1] = score
            if score > maxNum:
                maxNum = score

    return maxNum



def stringsimi(index2radicalstring, relatedRedical=True, size=2912):
    """Compute pairwise radical-string similarities between all indices.

    For every unordered pair of keys of ``index2radicalstring`` the
    similarity is ``2 * score / (len(a) + len(b))``, where ``a`` and ``b``
    are the radical strings found through the module-level ``index2id`` and
    ``id2radicalstring`` tables and ``score`` comes from
    ``getNumofCommonSubstr``. Two empty radical strings score 0.0 rather
    than raising ZeroDivisionError.

    Args:
        index2radicalstring: mapping whose keys are the integer indices to
            compare; they are also used as matrix coordinates.
        relatedRedical: forwarded to ``getNumofCommonSubstr``.
        size: side length of the returned square matrix. Every key of
            ``index2radicalstring`` must be smaller than this value.

    Returns:
        tuple: ``(pair_dict, matrix)`` where ``pair_dict`` maps
        ``"<id_a>_<id_b>"`` to the similarity (one entry per unordered pair,
        earlier index first) and ``matrix`` is a symmetric ``size x size``
        numpy array with the same values; cells that are never visited,
        including the diagonal, stay 0.
    """
    cpair2LCSsimi = {}
    cpair2LCSsimi_np = np.zeros((size, size))
    indices = list(index2radicalstring.keys())

    for pos, idx_a in enumerate(indices[:-1]):
        # Resolve the outer element once instead of once per inner iteration.
        id_a = index2id[idx_a]
        radicals_a = id2radicalstring[id_a]
        for idx_b in indices[pos + 1:]:
            id_b = index2id[idx_b]
            radicals_b = id2radicalstring[id_b]

            total_len = len(radicals_a) + len(radicals_b)
            if total_len:
                maxNum = getNumofCommonSubstr(radicals_a, radicals_b, relatedRedical)
                sim = (2 * maxNum) / total_len
            else:
                # Both strings are empty: nothing in common, avoid 0 / 0.
                sim = 0.0

            cpair2LCSsimi[id_a + "_" + id_b] = sim
            cpair2LCSsimi_np[idx_a, idx_b] = sim
            cpair2LCSsimi_np[idx_b, idx_a] = sim
    return cpair2LCSsimi, cpair2LCSsimi_np



def getcidsimi(cid1, cid2, relatedRedical=True, radical=False):
    """Return the best similarity between any two ids grouped under two cids.

    Each id listed in the module-level ``cid2id`` table for ``cid1`` is
    compared with each id listed for ``cid2``. A pair scores
    ``2 * score / (len(a) + len(b))`` on its radical strings, with ``score``
    from ``getNumofCommonSubstr``. Two empty radical strings score 0.0
    rather than raising ZeroDivisionError.

    Args:
        cid1: key of ``cid2id`` for the first group.
        cid2: key of ``cid2id`` for the second group.
        relatedRedical: forwarded to ``getNumofCommonSubstr``.
        radical: when set, each pair's score is divided by the cube root of
            (absolute difference of the two radical-string lengths + 1).

    Returns:
        The maximum pair score, or -1 if either group is empty.
    """
    idlist1 = cid2id[cid1]
    idlist2 = cid2id[cid2]
    maxsim = -1
    for id1 in idlist1:
        for id2 in idlist2:
            radicals1 = id2radicalstring[id1]
            radicals2 = id2radicalstring[id2]

            total_len = len(radicals1) + len(radicals2)
            if total_len:
                maxNum = getNumofCommonSubstr(radicals1, radicals2, relatedRedical)
                sim = (2 * maxNum) / total_len
            else:
                # Both strings are empty: nothing in common, avoid 0 / 0.
                sim = 0.0

            if radical:
                # Penalise pairs whose radical counts differ.
                count1 = len(index2radicalstring[id2index[id1]])
                count2 = len(index2radicalstring[id2index[id2]])
                sim = sim / pow(abs(count1 - count2) + 1, 1.0 / 3)

            if sim > maxsim:
                maxsim = sim

    return maxsim





def clevelstringsimi(relatedRedical=True, radical=False, size=2543):
    """Build the symmetric similarity matrix over the entries of index2cid.

    Entry ``[a][b]`` is ``getcidsimi(index2cid[a], index2cid[b], ...)`` for
    every ``a != b`` below ``size``; the diagonal stays 0.

    Args:
        relatedRedical: forwarded to ``getcidsimi``.
        radical: forwarded to ``getcidsimi``.
        size: number of leading ``index2cid`` entries to compare, and the
            side length of the result. Indices ``0 .. size - 1`` must all be
            keys of the module-level ``index2cid`` table; pass
            ``len(index2cid)`` to cover whatever has been loaded.

    Returns:
        numpy.ndarray: ``size x size`` matrix of similarities.
    """
    cpair2LCSsimi_np = np.zeros((size, size))
    for a in range(size - 1):
        cid_a = index2cid[a]
        for b in range(a + 1, size):
            sim = getcidsimi(cid_a, index2cid[b], relatedRedical, radical)
            # The measure is stored symmetrically.
            cpair2LCSsimi_np[a, b] = sim
            cpair2LCSsimi_np[b, a] = sim
    return cpair2LCSsimi_np







def Normalize(data):
    """Centre ``data`` on its mean and scale it by its value range.

    Computes ``(data - mean) / (max - min)`` with the statistics taken over
    all elements of ``data``.

    Args:
        data: numeric array-like accepted by numpy reductions.

    Returns:
        The rescaled data.
    """
    centre = np.mean(data)
    highest = np.max(data)
    lowest = np.min(data)
    return (data - centre) / (highest - lowest)


def similarityRank(num, temp):
    """Pick the ``num`` largest entries of every row, best first.

    Works by taking the row-wise argmax ``num`` times, overwriting each
    chosen cell with -1 before the next round. The overwriting is done on a
    private copy, so the caller's matrix is left untouched.

    Args:
        num: how many top entries to extract per row.
        temp: 2-D numpy array of scores, one row per item.

    Returns:
        tuple: ``(values, indices)``, each with one row per row of ``temp``
        and ``num`` columns; ``indices[r][k]`` is the column of the k-th
        pick in row ``r`` and ``values[r][k]`` is the score found there.
    """
    work = temp.copy()  # never clobber the caller's similarity matrix
    row_ids = np.arange(work.shape[0])

    picked_values = []
    picked_columns = []
    for _ in range(num):
        best_columns = np.argmax(work, axis=1)  # axis=1: one winner per row
        picked_values.append(work[row_ids, best_columns])
        picked_columns.append(best_columns)
        # Mask the winners so the next round finds the runner-up.
        work[row_ids, best_columns] = -1

    max_index = np.array(picked_columns)
    max_value = np.array(picked_values)
    return max_value.T, max_index.T


def showoutput(max_index, max_value, topk, level=0):
    """Turn ranking arrays into per-item lists of (neighbour, score) tuples.

    Row ``n`` of ``max_index`` / ``max_value`` describes item ``n``; its
    first ``topk`` columns are translated through the module-level lookup
    tables selected by ``level``:

    * ``level == 0``: ``index2id`` and ``index2character``
    * otherwise:      ``index2cid`` and ``index2ccharacter``

    Args:
        max_index: per-row neighbour indices, best first.
        max_value: per-row scores matching ``max_index``.
        topk: number of leading columns to keep per row.
        level: selects which pair of lookup tables is used.

    Returns:
        tuple: ``(cid2mappings, c2mappings)`` -- the first keyed by id, the
        second by character; each value is a list of ``(neighbour, score)``
        tuples. Rows that map to the same key overwrite earlier ones.
    """
    if level == 0:
        key_table, char_table = index2id, index2character
    else:
        key_table, char_table = index2cid, index2ccharacter

    cid2mappings = {}
    c2mappings = {}
    for row in range(len(max_index)):  # one row per item
        by_key = []
        by_char = []
        for rank in range(topk):
            neighbour = max_index[row][rank]
            score = max_value[row][rank]
            by_key.append((key_table[neighbour], score))
            by_char.append((char_table[neighbour], score))
        cid2mappings[key_table[row]] = by_key
        c2mappings[char_table[row]] = by_char
    return cid2mappings, c2mappings





# Load the workbook: related-radical pairs, the radical table, then the
# character-to-radical sheet (which also extends the pair list).
word_pairdic = readJia()
readJia1()
word_pairdic = readJia2(word_pairdic, word2id)

# Build the character-level CSS similarity matrix.
cpair2LCSsimi_np1 = clevelstringsimi(relatedRedical=True, radical=False)
# np.save('../data/results/jia_RLCS_simi_np', cpair2LCSsimi_np1)

# Produce the final character-level ranking (top 50 per character).
max_value, max_index = similarityRank(50, cpair2LCSsimi_np1)
ccid2mappings, cc2mappings = showoutput(max_index, max_value, 50, level=1)

# Print the ranked neighbours of a few sample characters.
for _sample_char in ('鼎', '宿', '木', '月', '犬', '刀', '降'):
    print(cc2mappings[_sample_char])







