# coding=utf-8
__author__ = 'boliangzhang'

import io
import word2vec
import operator
from os import listdir
from gensim import models


# Root directory walked by create_tweets_corpus(): every entry in it is listed
# as a sub-folder whose files are read. The value must end with a path
# separator because sub-folder names are appended by string concatenation.
TWEET_POSTAGGING_PATH = "/Users/boliangzhang/Documents/Phd/Morph_Encoding/Experiments/Other_resources/tweet_postagging_LARGE/"
# Binary word2vec file loaded (binary=True) by every similarity/ranking
# function in this module.
TWEET_BIN = "./tweets_500d.bin"
# Not referenced anywhere in the visible part of this file.
WIKI_BIN = "./enwik9.bin"


def create_tweets_corpus(corpus_dir=None, output_path="./word2vec.txt"):
    """Collect the middle line of every three-line record into one corpus file.

    The corpus directory is expected to contain sub-folders, each holding
    UTF-8 text files. Every file is read as a sequence of three-line records
    and the second line of each complete record is written to the output
    (presumably the tweet text -- TODO confirm against the tagger output
    format). Trailing lines that do not form a complete record are ignored.

    :param corpus_dir: root directory to walk; defaults to
        ``TWEET_POSTAGGING_PATH`` when omitted.
    :param output_path: file that receives the extracted lines; it is
        overwritten if it already exists.
    """
    # Local import: the module itself only imports ``listdir`` from ``os``.
    import os

    if corpus_dir is None:
        corpus_dir = TWEET_POSTAGGING_PATH

    # ``with`` guarantees the corpus is flushed and closed even on errors;
    # the previous version never closed the output file.
    with io.open(output_path, 'w', encoding='utf-8') as f_output:
        for folder in listdir(corpus_dir):
            print(folder)
            folder_path = os.path.join(corpus_dir, folder)
            for file_name in listdir(folder_path):
                tweet_path = os.path.join(folder_path, file_name)
                with io.open(tweet_path, 'r', encoding='utf-8') as tweet:
                    lines = tweet.readlines()
                # Floor division keeps the record count an int on Python 3.
                # Indices 1, 4, 7, ... are the second line of each record.
                complete = len(lines) // 3 * 3
                f_output.writelines(lines[1:complete:3])


def compare_semantic_similarity(a, b):
    """Return the similarity between words ``a`` and ``b``.

    The vectors stored in ``TWEET_BIN`` are loaded on every call, so this is
    meant for ad-hoc checks rather than for use inside a loop.

    :param a: first word, passed unchanged to ``model.similarity``.
    :param b: second word, passed unchanged to ``model.similarity``.
    :return: whatever ``model.similarity(a, b)`` returns.
    """
    tweet_model = models.Word2Vec.load_word2vec_format(TWEET_BIN, binary=True)
    return tweet_model.similarity(a, b)


def train_model():
    """Run ``word2vec.word2vec`` on the tweet corpus with fixed settings.

    Both file locations are hard-coded absolute paths. The call uses
    ``size=300``, ``window=10``, ``hs=1`` and verbose output.
    """
    corpus_path = "/Users/boliangzhang/Documents/Phd/MSRA_intern/word2vector/tweets_postagging.txt"
    model_path = "/Users/boliangzhang/Documents/Phd/MSRA_intern/word2vector/word2vec.bin"
    settings = dict(size=300, verbose=True, hs=1, window=10)
    word2vec.word2vec(corpus_path, model_path, **settings)


def read_annotation(path="../data/Annotations/morph_annotation_updated"):
    """Load the morph annotations from a tab-separated UTF-8 file.

    Each non-blank line has the morph in the first column and its annotated
    entities in the remaining columns. If a morph appears on several lines
    the last occurrence wins.

    :param path: annotation file to read; defaults to the project's
        annotation file.
    :return: dict mapping each morph to the list of its entities (the list
        is empty when the line has a single column).
    """
    annotations = dict()
    # ``with`` closes the file; the previous version leaked the handle.
    with io.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line would otherwise register a bogus '' morph.
                continue
            fields = line.split('\t')
            annotations[fields[0]] = fields[1:]
    return annotations


def read_candidates(path="../data/NER/ner_tweet.txt"):
    """Load the candidate names from a tab-separated UTF-8 file.

    Only the first column of each non-blank line is kept. File order and
    duplicates are preserved.

    :param path: candidate file to read; defaults to the project's NER
        output file.
    :return: list of candidate strings.
    """
    candidates = list()
    # ``with`` closes the file; the previous version leaked the handle.
    with io.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:  # blank lines would otherwise yield an empty candidate
                candidates.append(line.split('\t')[0])
    return candidates


def candidate_ranking(annotations, candidates):
    """Rank every candidate against every annotated morph by similarity.

    :param annotations: mapping whose keys are the morphs to resolve (only
        the keys are used here).
    :param candidates: iterable of candidate strings; duplicates collapse
        to a single entry per morph.
    :return: dict mapping each morph to a list of ``(candidate, score)``
        pairs sorted by descending score. A candidate whose similarity
        cannot be computed gets a score of 0.
    """
    print("loading BIN to model...")
    # NOTE(review): newer gensim releases expose this loader on KeyedVectors
    # instead of Word2Vec -- confirm against the installed gensim version.
    model = models.Word2Vec.load_word2vec_format(TWEET_BIN, binary=True)
    print("Done\n")
    candidate_ranking_dict = dict()
    for morph in annotations:
        candidates_score = dict()
        for candidate in candidates:
            try:
                simi = model.similarity(morph.encode('utf-8'), candidate.encode('utf-8'))
            except Exception:
                # Best effort: a pair that cannot be scored (for instance a
                # lookup failure for either word) ranks last. Unlike the
                # former bare ``except:``, Ctrl-C and SystemExit still
                # propagate.
                simi = 0
            candidates_score[candidate] = simi
        # ``items()`` works on both Python 2 and 3 (``iteritems`` does not).
        candidate_ranking_dict[morph] = sorted(
            candidates_score.items(), key=operator.itemgetter(1), reverse=True)
    return candidate_ranking_dict


def do_evaluation(annotations, candidate_ranking_dict, top_n):
    """Compute the hit rate of the annotated entities within the top-N ranks.

    For every morph in ``candidate_ranking_dict`` that also appears in
    ``annotations``, each annotated entity that equals one of the first
    ``top_n`` ranked candidates counts as one hit. The matched entities are
    printed on one tab-separated line.

    :param annotations: dict mapping morph -> list of correct entities.
    :param candidate_ranking_dict: dict mapping morph -> list of
        ``(candidate, score)`` pairs, best candidate first.
    :param top_n: number of leading candidates to inspect per morph.
    :return: number of hits divided by the number of ranked morphs (morphs
        missing from ``annotations`` still count in the denominator), or
        ``0.0`` when there are no ranked morphs.
    """
    hits = list()
    for morph, ranking in candidate_ranking_dict.items():
        try:
            correct_entities = annotations[morph]
        except KeyError:
            print(morph + ' not in annotations')
            continue
        top_candidates = set(entry[0] for entry in ranking[:top_n])
        # Exact string match only; each annotated entity counts at most once.
        hits.extend(entity for entity in correct_entities
                    if entity in top_candidates)

    print('\t'.join(hits))

    if not candidate_ranking_dict:
        # Nothing was ranked: avoid dividing by zero.
        return 0.0
    return float(len(hits)) / len(candidate_ranking_dict)


def print_results(candidate_ranking_dict, path):
    """Write one ranking file per morph.

    The file for a morph is named ``path + morph`` (plain string
    concatenation, so ``path`` normally ends with a path separator) and
    holds one ``candidate<TAB>score`` line per ranked candidate, in ranking
    order. Existing files are overwritten.

    :param candidate_ranking_dict: dict mapping morph -> list of
        ``(candidate, score)`` pairs.
    :param path: prefix prepended to each morph to build the file name.
    """
    for morph, ranking in candidate_ranking_dict.items():
        # ``with`` flushes and closes each file; the previous version left
        # every handle open.
        with io.open(path + morph, 'w', encoding='utf-8') as output:
            for candidate, score in ranking:
                # The file is opened in text mode, so write text: encoding
                # the score to bytes first breaks the concatenation on
                # Python 3.
                output.write(candidate + '\t' + str(score) + '\n')


def main():
    """Rank the NER candidates for every annotated morph and report accuracy.

    Reads the annotations and the candidate list, ranks all candidates for
    each morph, writes the per-morph rankings to disk and prints the result
    of ``do_evaluation`` at cut-offs 1, 5, 10 and 20.
    """
    # Optional one-off preparation steps, left disabled:
    # create_tweets_corpus()
    # train_model()

    annotations = read_annotation()
    candidates = read_candidates()

    candidate_ranking_dict = candidate_ranking(annotations, candidates)

    # Write the ranking detail. print_results expects (ranking dict, path
    # prefix); the previous call passed (annotations, ranking dict) and
    # failed when building the file name.
    # NOTE(review): the output directory is the one main_for_hongzhao uses;
    # confirm this is the intended location for this pipeline.
    print_results(candidate_ranking_dict, '../data/word2vec_exp_results/')

    for label, top_n in (('top 1:', 1), ('top 5:', 5),
                         ('top 10:', 10), ('top 20:', 20)):
        print(label)
        print(do_evaluation(annotations, candidate_ranking_dict, top_n))

    # Ad-hoc sanity check of a single pair ("Bo Xilai" vs. "airplane"):
    # print(compare_semantic_similarity("薄熙来", "飞机"))


def read_evaluation_set(directory='../data/Hongzhao_evaluation_set'):
    """Load the per-morph candidate lists of the evaluation set.

    Every file in ``directory`` is named after a morph and lists that
    morph's candidates, one per line, in the first tab-separated column.

    :param directory: folder holding one UTF-8 file per morph; defaults to
        the project's evaluation-set folder.
    :return: dict mapping each morph (the file name, as text) to the list
        of its candidates in file order.
    """
    # Local import: the module itself only imports ``listdir`` from ``os``.
    import os

    evaluation_set = dict()
    for file_name in listdir(directory):
        candidates = list()
        # ``with`` closes the file; the previous version leaked the handle.
        with io.open(os.path.join(directory, file_name), 'r', encoding='utf-8') as f:
            for line in f:
                # Drop the line terminator first: a line without a tab used
                # to keep its trailing newline in the candidate name.
                line = line.rstrip('\r\n')
                if line:
                    candidates.append(line.split('\t')[0])
        # listdir yields byte strings on Python 2 and text on Python 3;
        # only the former has to be decoded.
        if isinstance(file_name, bytes):
            morph = file_name.decode('utf-8')
        else:
            morph = file_name
        evaluation_set[morph] = candidates
    return evaluation_set


def candidate_ranking_for_Hongzhao(evaluation_set):
    """Rank each morph's own candidate list by similarity to the morph.

    :param evaluation_set: dict mapping morph -> list of candidate strings;
        duplicate candidates collapse to a single entry.
    :return: dict mapping each morph to a list of ``(candidate, score)``
        pairs sorted by descending score. A candidate whose similarity
        cannot be computed gets a score of 0.
    """
    print("loading BIN to model...")
    # NOTE(review): newer gensim releases expose this loader on KeyedVectors
    # instead of Word2Vec -- confirm against the installed gensim version.
    model = models.Word2Vec.load_word2vec_format(TWEET_BIN, binary=True)
    print("Done\n")
    candidate_ranking_dict = dict()
    for morph in evaluation_set:
        candidates_score = dict()
        for candidate in evaluation_set[morph]:
            try:
                simi = model.similarity(morph.encode('utf-8'), candidate.encode('utf-8'))
            except Exception:
                # Best effort: a pair that cannot be scored ranks last.
                # Unlike the former bare ``except:``, Ctrl-C and SystemExit
                # still propagate.
                simi = 0
            candidates_score[candidate] = simi
        # ``items()`` works on both Python 2 and 3 (``iteritems`` does not).
        candidate_ranking_dict[morph] = sorted(
            candidates_score.items(), key=operator.itemgetter(1), reverse=True)
    return candidate_ranking_dict


def main_for_hongzhao():
    """Run the ranking experiment on the Hongzhao evaluation set.

    Loads the annotations and the evaluation set, ranks each morph's
    candidates, writes the rankings under ``../data/word2vec_exp_results/``
    and prints the result of ``do_evaluation`` at cut-offs 1, 5, 10, 20
    and 50.
    """
    annotations = read_annotation()
    rankings = candidate_ranking_for_Hongzhao(read_evaluation_set())

    # Dump the full ranking of every morph before scoring.
    print_results(rankings, '../data/word2vec_exp_results/')

    cutoffs = (
        ('top 1:', 1),
        ('top 5:', 5),
        ('top 10:', 10),
        ('top 20:', 20),
        ('top 50:', 50),
    )
    for heading, cutoff in cutoffs:
        print(heading)
        print(do_evaluation(annotations, rankings, cutoff))


def results_processing():
    """Normalize the final experiment scores, save them and report accuracy.

    Every file in ``../data/final_experiment_results`` is named after a
    morph and holds ``candidate<TAB>score`` lines. Scores are replaced by
    their absolute value and divided by the largest value in the file, so
    the best candidate scores 1. The normalized rankings are written to
    ``../data/final_experiment_results_normalized/`` and the result of
    ``do_evaluation`` is printed at cut-offs 1, 5, 10 and 20.
    """
    final_results = dict()
    for file_name in listdir('../data/final_experiment_results'):
        candidates_ranking = dict()
        # ``with`` closes the file; the previous version leaked the handle.
        with io.open('../data/final_experiment_results/' + file_name, 'r',
                     encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines
                fields = line.split('\t')
                candidates_ranking[fields[0]] = abs(float(fields[1]))

        # An empty file has no maximum; treat it like an all-zero file
        # instead of letting ``max()`` raise.
        max_score = max(candidates_ranking.values()) if candidates_ranking else 0
        for candidate in candidates_ranking:
            if max_score:
                candidates_ranking[candidate] /= max_score
            else:
                # Every score is zero, so there is nothing to scale by.
                candidates_ranking[candidate] = 0

        # listdir yields byte strings on Python 2 and text on Python 3;
        # only the former has to be decoded.
        if isinstance(file_name, bytes):
            morph = file_name.decode('utf-8')
        else:
            morph = file_name
        # ``items()`` works on both Python 2 and 3 (``iteritems`` does not).
        final_results[morph] = sorted(
            candidates_ranking.items(), key=operator.itemgetter(1), reverse=True)

    print_results(final_results, '../data/final_experiment_results_normalized/')

    annotations = read_annotation()

    for top_n in (1, 5, 10, 20):
        # Evaluate first: do_evaluation prints the hit list itself, and the
        # former ``print(...),`` trick let that output land between the
        # label and the score.
        accuracy = do_evaluation(annotations, final_results, top_n)
        print('acc @ %d: %s' % (top_n, accuracy))




# Script entry point. Only the score-normalization/evaluation pass is enabled;
# the two ranking pipelines are left commented out and are switched on by
# editing this block.
if __name__ == "__main__":
    # main()
    # main_for_hongzhao()
    results_processing()

