import os
import sys
# day_dict[d]  : token -> number of occurrences on day-of-year index d (0-based).
# day_count[d] : number of input files attributed to day d.
# Both lists start empty and get one slot per day once the year length is known.
day_dict = []
day_count = []
# year_dict : token -> number of occurrences over the whole year.
year_dict = {}
# year_count: total number of input files seen for the year.
year_count = 0

# Number of days that precede the 1st of each month (index = month - 1).
# "ping" is the common-year table (February has 28 days),
# "run" is the leap-year table (February has 29 days).
month_day_ping = [0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334]
month_day_run = [0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335]

# sys.argv[2] is the year being processed.  flag is True for a leap year.
# Use the full Gregorian rule: a plain "divisible by 4" test wrongly treats
# century years such as 1900 or 2100 as leap years.
_target_year = int(sys.argv[2])
flag = (_target_year % 4 == 0 and _target_year % 100 != 0) or _target_year % 400 == 0

def startday(date):
    """Return the zero-based day-of-year index for a ``YYYYMMDD`` string.

    Only the month (characters 4-5) and day (characters 6-7) are read.
    Whether February has 29 days is taken from the module-level ``flag``,
    not from the year digits inside ``date``, so every date is placed on
    the calendar of the year the script was started for.

    Raises ValueError when the month or day field is not a decimal number.
    """
    m = int(date[4:6])
    d = int(date[6:8])
    # "run" table = leap year, "ping" table = common year.
    days_before_month = month_day_run if flag else month_day_ping
    return days_before_month[m - 1] + d - 1

# Initialisation: one empty token table and one zeroed file counter for
# every day of the year (366 slots in a leap year, 365 otherwise).
total_day = 366 if flag else 365
day_dict.extend({} for _ in range(total_day))
day_count.extend([0] * total_day)


# sys.argv[1] is the corpus root; each of its entries is later treated as a
# month directory.
monthlist = os.listdir(sys.argv[1])
# The output directory and the marker that precedes the date in file names
# depend on whether the root path contains 'en_stream'
# (presumably English vs. Chinese corpora -- confirm against the data layout).
outpath, match_str = (
    ('en_burst/', 'ENG_') if 'en_stream' in sys.argv[1] else ('ch_burst/', 'CMN_')
)
# Scan every month directory of the requested year and accumulate token
# counts per day (day_dict / day_count) and for the whole year
# (year_dict / year_count).
print('loading...')
for month in monthlist:
    # Only directories whose name starts with the requested year are used.
    if not month.startswith(sys.argv[2]):
        continue
    print(month)
    month_dir = sys.argv[1] + '/' + month
    filelist = os.listdir(month_dir)
    # Every file in the directory counts as one document of the year.
    year_count += len(filelist)
    for fname in filelist:
        # The 8 characters that follow the 4-character marker are read as
        # the YYYYMMDD date of the file.
        date_start = fname.find(match_str) + 4
        day = startday(fname[date_start:date_start + 8])
        day_count[day] += 1
        day_tokens = day_dict[day]
        # 'with' guarantees the handle is closed even if parsing fails;
        # iterating the handle avoids loading the whole file at once.
        with open(month_dir + '/' + fname) as handle:
            for line in handle:
                for token in line.strip().split(' '):
                    if token == '':
                        continue
                    if '#' in token:
                        # A lone '#' carries no word; otherwise keep only
                        # the part before the last '#'
                        # (presumably a word#tag annotation -- confirm).
                        # NOTE(review): a token whose only '#' is its first
                        # character becomes '' here and is still counted.
                        if len(token) == 1:
                            continue
                        token = token[0:token.rfind('#')]
                    day_tokens[token] = day_tokens.get(token, 0) + 1
                    year_dict[token] = year_dict.get(token, 0) + 1

# year_normtf: token -> yearly occurrences divided by the number of files
# seen for the year.
year_normtf = {}

print('computing norm...')
# Each line of year.tf is: token <TAB> normalised frequency <TAB> raw count.
# 'with' closes the file even when a write fails part-way through.
with open(outpath + 'year.tf', 'w') as output:
    for e in year_dict:
        year_normtf[e] = year_dict[e] * 1.0 / year_count
        output.write(e + '\t' + str(year_normtf[e]) + '\t' + str(year_dict[e]) + '\n')

# Burst analysis: for every day write two files into outpath.
#   day_<i>.tf : token, raw count, per-file frequency, burst rate (all tokens)
#   day_<i>.b  : token, burst rate, raw count, per-file frequency
#                (only tokens with burst rate >= 3 and raw count >= 3)
# The burst rate is the day's per-file frequency divided by the yearly
# normalised frequency of the same token.
print('burst analysis...')
for i in range(0, total_day):
    day_prefix = outpath + 'day_' + str(i)
    files_today = day_count[i]
    # Nested 'with' blocks close both files even if a write raises.
    with open(day_prefix + '.tf', 'w') as tf_out:
        with open(day_prefix + '.b', 'w') as burst_out:
            # A day without files has an empty table, so the division
            # below is never reached with files_today == 0.
            for e in day_dict[i]:
                occurrences = day_dict[i][e]
                tf_doc = occurrences * 1.0 / files_today
                burstrate = tf_doc / year_normtf[e]
                tf_out.write(e + '\t' + str(occurrences) + '\t' + str(tf_doc) + '\t' + str(burstrate) + '\n')
                if burstrate >= 3 and occurrences >= 3:
                    burst_out.write(e + '\t' + str(burstrate) + '\t' + str(occurrences) + '\t' + str(tf_doc) + '\n')

print('done.')
