# Load the parallel lexicon: line i of the "ch" file is paired with line i of
# the "en" file. Produces:
#   vol        - set of every stripped "ch" entry
#   en_vol     - set of every stripped "en" entry
#   conf_table - confidence of each known translation pair
vol = set()
en_vol = set()
conf_table = {} #key is ch#en and value is confidence of this translation pair.
# Both files are opened before either is read, and the `with` block closes
# them even if reading fails part-way through.
with open('lexicon/ch_lexicon') as ch_file, open('lexicon/en_lexicon') as en_file:
    # zip() stops at the shorter file, so unpaired trailing lines are ignored.
    for ch_entry, en_entry in zip(ch_file, en_file):
        ch_entry = ch_entry.strip()
        en_entry = en_entry.strip()
        # Every pair listed in the lexicon gets full confidence.
        conf_table[ch_entry + '#' + en_entry] = 1.0
        vol.add(ch_entry)
        en_vol.add(en_entry)
# feature decoding
# ch_feature_list[i] holds the tab-separated fields of line i (0-based) of the
# index file; it is indexed by integer feature id further down.
# The `with` block closes the file even on error and avoids rebinding the
# builtin name `input`.
with open('ch_feature_index.txt') as index_file:
    ch_feature_list = [row.strip().split('\t') for row in index_file]

# en_feature_list[i] holds the tab-separated fields of line i (0-based) of the
# "en" feature index file.
# The `with` block closes the file even on error and avoids rebinding the
# builtin name `input`.
with open('en_feature_index.txt') as index_file:
    en_feature_list = [row.strip().split('\t') for row in index_file]

# Switched to True further down when the time burst file contains a time
# index >= 365; it then selects the value of total_day.
flag = False

#loading english graph to figure out which nodes are in the graph.
# feature_in_graph collects, as ints, every element of the collection stored
# in the first tab-separated column of each line.
feature_in_graph = set()
with open('en_graph_info.txt') as graph_file:
    for row in graph_file:
        fields = row.strip().split('\t')
        # NOTE(review): eval() executes whatever expression the file contains.
        # If this column is always a plain literal (e.g. a tuple of ids),
        # ast.literal_eval would be the safe replacement - confirm the file
        # format before switching.
        pair = eval(fields[0])
        feature_in_graph.update(int(node) for node in pair)

#loading english time burst feature information
# en_time_feature maps a time index (column 1) to the list of feature ids
# (column 3) recorded for that time, keeping only ids found in
# feature_in_graph.
en_time_feature = {}
with open('en_time_burst_feature.txt') as burst_file:
    for row in burst_file:
        fields = row.strip().split('\t')
        day = int(fields[0])
        if day >= 365:
            flag = True
        # Every time index seen gets an entry, even when none of its features
        # pass the graph filter (the list then stays empty).
        features_at_day = en_time_feature.setdefault(day, [])
        feature = int(fields[2])  # parsed once, used for both test and append
        if feature in feature_in_graph:
            features_at_day.append(feature)

# Upper bound used to clamp candidate periods: 364 by default, 365 when the
# data contains a time index >= 365 (presumably a leap year - confirm).
total_day = 365 if flag else 364

# loading graph information
# For every term id in the first column of ch_ad_table.txt whose decoded term
# is missing from the lexicon (vol), gather all en feature ids recorded in
# en_time_feature during the term's period, clamped to [0, total_day].
cand = {} #recording the possible candidates of OOV.
with open('ch_ad_table.txt') as table_file:
    for row in table_file:
        fields = row.strip().split('\t')
        term = int(fields[0])
        decode_feature = ch_feature_list[term]
        decode_term = decode_feature[0]
        # NOTE(review): eval() executes whatever expression the index file
        # contains. If this field is always a literal (start, end) pair,
        # ast.literal_eval would be the safe replacement - confirm first.
        decode_period = eval(decode_feature[1])
        if decode_term in vol:
            # Term already has a lexicon translation; no candidates needed.
            continue
        # A term may appear on several lines; its candidates accumulate.
        candidates = cand.setdefault(term, set())
        first_day = max(decode_period[0], 0)
        last_day = min(decode_period[1], total_day)
        # search possible translation from the burst words during possible period
        # range() instead of xrange() so this runs on Python 2 and 3 alike;
        # the span is bounded by total_day, so the Python 2 list stays small.
        for day in range(first_day, last_day + 1):
            # NOTE: candidates are not restricted to words outside en_vol;
            # an OOV-only filter on en_feature_list was left disabled here.
            candidates.update(en_time_feature.get(day, ()))

# Write one line per term that has at least one candidate:
#   <term id> TAB <str() of the candidate set>
# Terms whose candidate set is empty are skipped. The `with` block guarantees
# the file is flushed and closed even if a write fails.
with open('cand_list.txt', 'w') as out:
    for term, candidates in cand.items():
        if candidates:
            out.write(str(term) + '\t' + str(candidates) + '\n')
