import re

# Load the bilingual lexicon. The two files are read in parallel: line i of
# 'lexicon/ch_lexicon' is paired with line i of 'lexicon/en_lexicon'.
print('reading lexicon')
with open('lexicon/ch_lexicon') as ch_lexicon_file:
    lines1 = ch_lexicon_file.readlines()
with open('lexicon/en_lexicon') as en_lexicon_file:
    lines2 = en_lexicon_file.readlines()

# trans_dict maps each Chinese lexicon entry to the set of English entries
# paired with it. English entries are lower-cased and every space is
# replaced by '_' so that a multi-word entry becomes a single token.
# NOTE: zip() stops at the shorter file, so surplus lines in the longer
# lexicon file are silently ignored.
trans_dict = {}
for line1, line2 in zip(lines1, lines2):
    ch_entry = line1.strip()
    en_entry = line2.strip().lower().replace(' ', '_')
    trans_dict.setdefault(ch_entry, set()).add(en_entry)

print('reading ch and en features')


def _load_feature_index(path):
    """Read a feature index file and build its decode/encode tables.

    A feature's index is its zero-based line number in the file.

    Args:
        path: Path of the feature index file to read.

    Returns:
        A ``(decode, encode)`` tuple. ``decode`` is a list holding every
        stripped line, so ``decode[i]`` is the text of feature ``i``.
        ``encode`` maps the first tab-separated field of a line (the term)
        to the set of feature indices whose line starts with that term.
    """
    decode = []
    encode = {}
    with open(path) as index_file:
        for feature_index, line in enumerate(index_file):
            line = line.strip()
            decode.append(line)
            term = line.split('\t')[0]
            # The same term may appear on several lines; keep every index.
            encode.setdefault(term, set()).add(feature_index)
    return decode, encode


ch_feature_decode, ch_feature_encode = _load_feature_index('ch_feature_index.txt')
en_feature_decode, en_feature_encode = _load_feature_index('en_feature_index.txt')

# Regexes matching runs of ASCII alphanumerics and runs of ASCII letters.
# `pattern` is not referenced anywhere else in the visible script.
pattern = re.compile('[A-Za-z0-9]+')
letter_pattern = re.compile('[A-Za-z]+')

# Indices of Chinese features that received at least one lexicon-based
# candidate below.
ch_feature_in_lexicon_set = set()

# `out` is left open on purpose: the following section keeps writing to it.
out = open('init_cand.txt','w')
print('outputing...')

# Every lexicon pair whose two sides are both known features is written as
# "<ch index>\t<en index>\t1.0", for all index combinations of the pair.
lexicon_score = str(1.0)
for ch_word, en_words in trans_dict.items():
    if ch_word not in ch_feature_encode:
        continue
    ch_ids = ch_feature_encode[ch_word]
    for en_word in en_words:
        if en_word not in en_feature_encode:
            continue
        for en_id in en_feature_encode[en_word]:
            for ch_id in ch_ids:
                ch_feature_in_lexicon_set.add(ch_id)
                out.write('\t'.join((str(ch_id), str(en_id), lexicon_score)) + '\n')


# Total score shared equally among the candidates of a Chinese feature that
# has no lexicon-based candidate.
prior = 0.5

# Each line of 'narrow_cand_list.txt' is "<ch feature index>\t<candidates>",
# where <candidates> is evaluated as a Python expression and then iterated.
# The try/finally guarantees that `out` (opened earlier in the script) is
# closed even if a line fails to parse.
try:
    with open('narrow_cand_list.txt') as cand_file:
        for line in cand_file:
            attr = line.strip().split('\t')
            ch_feature = int(attr[0])
            # Features already covered by the lexicon keep only those entries.
            if ch_feature in ch_feature_in_lexicon_set:
                continue

            # SECURITY NOTE: eval() executes arbitrary code found in the
            # file. Only run this on a candidate list you generated
            # yourself; if the field is always a plain literal, switch to
            # ast.literal_eval.
            en_feature_cand = eval(attr[1])
            n_en_feature_cand = len(en_feature_cand)
            if n_en_feature_cand == 0:
                # Nothing to write, and dividing the prior by zero would raise.
                continue

            # A Chinese feature whose index-file line contains any ASCII
            # letter gets score 1.0 for all of its candidates; otherwise the
            # prior is split evenly. The test does not depend on the
            # candidate, so it is evaluated once per line.
            if letter_pattern.search(ch_feature_decode[ch_feature]):
                init_score = 1.0
            else:
                init_score = prior * 1.0 / n_en_feature_cand

            score_text = str(init_score)
            for e in en_feature_cand:
                out.write(attr[0] + '\t' + str(e) + '\t' + score_text + '\n')
finally:
    out.close()

