# -*- coding: utf-8 -*-
import sys
#INPUT = Two semantic similarity datasets ("file_1" and "file_2"), previously aligned line by line
#OUTPUT = A cross-lingual semantic similarity dataset ("txtfile"), which will be created in the input directory and named "cross_" followed by the names of the two input datasets joined by an underscore.

def main(path, file_1, file_2, size_sim_scale):
    """Create a cross-lingual similarity dataset from two line-aligned datasets.

    Every input line is expected to hold ``word_a<TAB>word_b<TAB>score``.
    For each pair of aligned lines whose scores differ by at most a quarter
    of the similarity scale, two cross-lingual pairs are produced: the first
    word of dataset 1 with the second word of dataset 2, and the second word
    of dataset 1 with the first word of dataset 2. Both receive the mean of
    the two input scores. A pair produced more than once keeps the running
    mean of all the scores assigned to it.

    The result is written to ``path + "cross_" + <file_1> + "_" + <file_2>
    + ".txt"`` (every ".txt" occurrence is removed from the input names),
    one ``word<TAB>word<TAB>score`` entry per line, without a trailing
    newline.

    Args:
        path: Prefix prepended verbatim to every file name, so a directory
            must include its trailing separator.
        file_1: File name of the first dataset.
        file_2: File name of the second dataset.
        size_sim_scale: Size of the similarity scale; a quarter of it is the
            largest score difference accepted between aligned lines.
    """
    def cross_lingual_dataset_creation(file1, file2):
        """Return a dict mapping each cross-lingual word pair to its score."""
        pair_score = {}
        pair_count = {}

        # Context managers make sure both input handles are released.
        with open(path + file1) as handle:
            dataset_1 = handle.readlines()
        with open(path + file2) as handle:
            dataset_2 = handle.readlines()

        if len(dataset_1) != len(dataset_2):
            print("ERROR: files have different number of lines")

        max_gap = size_sim_scale / 4.0

        # zip() stops at the shorter dataset: lines without a counterpart
        # cannot be aligned, and the mismatch has already been reported.
        for line_number, (line_1, line_2) in enumerate(zip(dataset_1, dataset_2), 1):
            try:
                fields_1 = line_1.split("\t")
                fields_2 = line_2.split("\t")
                word_1_1, word_1_2 = fields_1[0], fields_1[1]
                word_2_1, word_2_2 = fields_2[0], fields_2[1]
                score_1 = float(fields_1[2])
                score_2 = float(fields_2[2])
            except (IndexError, ValueError):
                # Missing columns (IndexError) or a non-numeric score
                # (ValueError): report the line and carry on.
                print("WARNING: line " + str(line_number) + " of one of the datasets does not follow the standard format")
                continue

            # Written as a positive test so that NaN differences are
            # rejected, exactly like a max/min comparison would.
            if abs(score_1 - score_2) <= max_gap:
                average_score = (score_1 + score_2) / 2
                for pair in ((word_1_1, word_2_2), (word_1_2, word_2_1)):
                    if pair not in pair_score:
                        pair_score[pair] = average_score
                        pair_count[pair] = 1
                    else:
                        # Incremental mean over every score seen for the pair.
                        seen = pair_count[pair]
                        pair_score[pair] = ((pair_score[pair] * seen) + average_score) / (seen + 1)
                        pair_count[pair] = seen + 1

        return pair_score

    dict_cross_dataset = cross_lingual_dataset_creation(file_1, file_2)

    # Output file named "cross_" plus the concatenation of the two input datasets names
    output_name = path + "cross_" + file_1.replace(".txt", '') + "_" + file_2.replace(".txt", '') + ".txt"
    with open(output_name, 'w') as txtfile:
        # Joining with "\n" separates the entries without a trailing newline.
        txtfile.write("\n".join(
            first + "\t" + second + "\t" + str(score)
            for (first, second), score in dict_cross_dataset.items()
        ))
    print("Creation of the cross-lingual dataset finished")

if __name__ == '__main__':
    # Script entry point: exactly four positional arguments are required;
    # anything else prints the usage text and exits.
    cli_args = sys.argv[1:]

    if len(cli_args) != 4:
        sys.exit('''
            Requires:
            path -> Path of the monolingual datasets' directory and path where the cross-lingual dataset will be created (by default it is the same path)
            file_1 -> File name of the first dataset
            file_2 -> File name of the second dataset
            size_sim_scale -> Size of the similarity scale

            ''')

    path, file_1, file_2, raw_scale = cli_args
    # The scale arrives as text on the command line and is used numerically.
    size_sim_scale = float(raw_scale)

    main(path, file_1, file_2, size_sim_scale)
