import re
import LCS
# Load the Chinese and English feature indexes.


def _load_feature_index(path):
    """Read a tab-separated index file into a list of field lists.

    Each line is stripped of surrounding whitespace and split on tabs, so
    the result holds one list of string fields per line, in file order.
    The file is closed even if reading fails part-way through.
    """
    with open(path) as index_file:
        return [row.strip().split('\t') for row in index_file]


ch_feature = _load_feature_index('ch_feature_index.txt')
en_feature = _load_feature_index('en_feature_index.txt')

def _load_lexicon(en_path, ch_path):
    """Build an English -> set-of-Chinese lexicon from two parallel files.

    Line i of ``en_path`` is paired with line i of ``ch_path``.  English
    entries are stripped, lower-cased and have every space replaced by an
    underscore before being used as keys; Chinese entries are stripped only.
    Several Chinese entries may accumulate under the same English key.

    NOTE: ``zip`` stops at the shorter file, so surplus lines in the longer
    one are silently ignored.
    """
    lexicon = {}
    with open(en_path) as en_file:
        with open(ch_path) as ch_file:
            for en_entry, ch_entry in zip(en_file, ch_file):
                # Equivalent to splitting on ' ' and re-joining with '_'.
                key = en_entry.strip().lower().replace(' ', '_')
                lexicon.setdefault(key, set()).add(ch_entry.strip())
    return lexicon


en_ch_lexicon = _load_lexicon('lexicon/en_lexicon', 'lexicon/ch_lexicon')

# Regular expressions for pulling ASCII tokens out of a term.
# Runs of ASCII letters and/or digits.
pattern = re.compile(r'[0-9a-zA-Z]+')
# Runs of ASCII digits only.
digit_pattern = re.compile(r'[0-9]+')
# An upper-case letter followed by one or more lower-case letters or digits.
cap_pattern = re.compile(r'[A-Z][a-z0-9]+')

def _narrow_by_digits(digit_runs, candidates):
    """Keep candidates whose digits match those of the Chinese term.

    ``digit_runs`` are the digit groups found in the Chinese term.  A
    candidate is kept when the concatenation of its own digit groups equals
    the Chinese groups concatenated either in order or in reversed group
    order.
    """
    forward = ''.join(digit_runs)
    backward = ''.join(reversed(digit_runs))
    kept = set()
    for en_id in candidates:
        en_digits = ''.join(digit_pattern.findall(en_feature[en_id][0]))
        if en_digits == forward or en_digits == backward:
            kept.add(en_id)
    return kept


def _narrow_by_words(tokens, cap_words, candidates):
    """Keep candidates that literally contain a token of the Chinese term.

    ``tokens`` are the alphanumeric runs of the Chinese term and
    ``cap_words`` its capitalised words.  When the term has more than one
    capitalised word and the candidate has more than one word, the candidate
    is dropped unless more than one (lower-cased capitalised word, candidate
    word) pair is equal.  A surviving candidate is kept when a lower-cased
    token equals one of its words or equals all of its words concatenated.
    """
    lowered_caps = [word.lower() for word in cap_words]
    require_overlap = len(cap_words) > 1

    # The eligibility filter does not depend on the token, so run it once.
    eligible = []
    for en_id in candidates:
        en_words = pattern.findall(en_feature[en_id][0])
        if require_overlap and len(en_words) > 1:
            shared = sum(1 for cap in lowered_caps
                         for word in en_words if cap == word)
            if shared <= 1:
                continue
        eligible.append((en_id, en_words, ''.join(en_words)))

    kept = set()
    for token in tokens:
        needle = token.lower()
        for en_id, en_words, joined in eligible:
            if needle in en_words or joined == needle:
                kept.add(en_id)
    return kept


def _narrow_by_lexicon(ch_term, candidates):
    """Keep candidates using the bilingual lexicon and LCS overlap.

    Used when the Chinese term contains no ASCII letters or digits.  For
    each candidate English term:

    * if the whole term is a lexicon key, keep it only when some Chinese
      translation has ``LCS.LCSlength(ch_term, translation) > 0``;
    * otherwise drop it if it contains digits;
    * otherwise keep it if it consists of exactly one alphanumeric run;
    * otherwise keep it when none of its words is a lexicon key, or when
      the per-word best LCS values sum to more than zero.
    """
    kept = set()
    for en_id in candidates:
        en_term = en_feature[en_id][0]

        if en_term in en_ch_lexicon:
            if any(LCS.LCSlength(ch_term, ch_word) > 0
                   for ch_word in en_ch_lexicon[en_term]):
                kept.add(en_id)
            continue

        if digit_pattern.findall(en_term):
            continue

        en_words = pattern.findall(en_term)
        if len(en_words) == 1:
            # The whole term is not in the lexicon (checked above), so a
            # single-run term is treated as out-of-vocabulary and kept.
            kept.add(en_id)
            continue

        out_of_vocab = True
        lcs_total = 0
        for en_word in en_words:
            if en_word in en_ch_lexicon:
                out_of_vocab = False
                best = 0
                for ch_word in en_ch_lexicon[en_word]:
                    overlap = LCS.LCSlength(ch_term, ch_word)
                    if overlap > best:
                        best = overlap
                lcs_total += best
        if lcs_total > 0 or out_of_vocab:
            kept.add(en_id)
    return kept


# Narrow every candidate list in cand_list.txt and write the non-empty
# results to narrow_cand_list.txt.  Each input line is
# "<chinese feature index>\t<python expression for the candidate ids>".
with open('cand_list.txt') as cand_file:
    with open('narrow_cand_list.txt', 'w') as out:
        for line in cand_file:
            line = line.strip()
            if not line:
                # Blank lines carry no record; skip them instead of failing
                # on int('').
                continue
            attr = line.split('\t')
            ch_term = ch_feature[int(attr[0])][0]
            cap_list = cap_pattern.findall(ch_term)
            if len(cap_list) > 1:
                # Several capitalised words: compare against them only.
                ch_term = '_'.join(cap_list)
            match_list = pattern.findall(ch_term)

            # SECURITY NOTE: eval() executes whatever is in the file, so
            # cand_list.txt must come from a trusted source.  It is kept
            # because the ids may be stored as "set([...])" (the format this
            # script itself writes under Python 2), which
            # ast.literal_eval cannot parse.
            candidates = eval(attr[1])

            if match_list:
                digi_match = digit_pattern.findall(ch_term)
                if digi_match:
                    print(digi_match)
                    narrowed = _narrow_by_digits(digi_match, candidates)
                else:
                    narrowed = _narrow_by_words(match_list, cap_list,
                                                candidates)
            else:
                narrowed = _narrow_by_lexicon(ch_term, candidates)

            if narrowed:
                out.write(attr[0] + '\t' + str(narrowed) + '\n')
