import pandas as pd
import numpy as np
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

class DataProcessor:
    """Load tweets labelled OFF from a TSV file and split them into folds.

    On construction the emoji vocabulary is read from ``emojiFile`` and the
    rows of ``dataFile`` whose ``subtask_a`` is ``'OFF'`` are preprocessed and
    stored in two lists, keyed by their ``subtask_b`` value:

    * ``train_off_unt`` -- rows with ``subtask_b == 'UNT'`` (label 0)
    * ``train_off_tin`` -- rows with ``subtask_b == 'TIN'`` (label 1)
    """

    # Tokens that get one space inserted in front of them, in application order.
    _CLITICS = (
        "'m", "’m", "'re", "’re", "'ll", "’ll", "'ve", "’ve",
        "'d", "’d", "n't", "n’t", "'s", "’s", "s'", "s’",
    )

    # Characters that get one space inserted on both sides, in application order.
    _PADDED = (
        ".", ",", "!", "！", "?", "？", "#", "$", "\"", "”", "“", ";", ":",
        "(", ")", "[", "]", "{", "}", "+", "-", "*", "/", "=", "—", "…", "•",
    )

    # Characters replaced by one plain space: a whitespace character first,
    # then skin-tone modifiers and gender signs.
    # NOTE(review): confirm which whitespace character the first entry is meant
    # to be (if it is a plain space the replacement is a no-op), and whether
    # the fifth skin-tone modifier was left out on purpose.
    _BLANKED = (" ", "🏻", "🏼", "🏽", "🏾", "♂", "♀")

    # Fold widths used by getDivided for the UNT and TIN lists.
    _UNT_FOLD = 105
    _TIN_FOLD = 775

    def __init__(self, dataFile, emojiFile):
        """Read the emoji vocabulary, then load and bucket the tweets.

        Args:
            dataFile: path of the tab-separated training file.
            emojiFile: path of the emoji vector file (only the first token of
                each line is used).
        """
        self.emoji_vocab = self.getVecFromEmoji(emojiFile)
        self.train_off_unt, self.train_off_tin = self.getSentences(dataFile)
        self.train_off_unt_leng = len(self.train_off_unt)
        self.train_off_tin_leng = len(self.train_off_tin)

    def prePro_1(self, sentence):
        """Lower-case ``sentence`` and space out mentions, clitics and punctuation.

        The replacements are applied in a fixed order: ``@user`` first, then the
        clitics, then the padded punctuation, then the blanked characters.

        Args:
            sentence: raw tweet text.

        Returns:
            The rewritten string. Runs of spaces are not collapsed.
        """
        sentence = sentence.lower().replace("@user", " <user> ")
        for clitic in self._CLITICS:
            sentence = sentence.replace(clitic, " " + clitic)
        for mark in self._PADDED:
            sentence = sentence.replace(mark, " " + mark + " ")
        for char in self._BLANKED:
            sentence = sentence.replace(char, " ")
        return sentence

    def prePro_2_attEmoji(self, sentence, getEmoji=True):
        """Surround every vocabulary key found in ``sentence`` with spaces.

        Args:
            sentence: text to scan.
            getEmoji: when true, also return the vocabulary ids of the keys
                that were found.

        Returns:
            ``(sentence, ids)`` when ``getEmoji`` is true, otherwise only the
            rewritten sentence. ``ids`` follows the vocabulary's iteration
            order, one entry per distinct key found.
        """
        emoji_encode_list = []
        for key, code in self.emoji_vocab.items():
            # An empty key "matches" every string, and replacing '' would put
            # spaces between all characters of the sentence, so skip it.
            if not key or key not in sentence:
                continue
            emoji_encode_list.append(code)
            sentence = sentence.replace(key, " " + key + " ")
        if not getEmoji:
            return sentence
        return sentence, emoji_encode_list

    def getVecFromEmoji(self, modelFile, max_entries=1661):
        """Build a ``token -> index`` vocabulary from the lines of ``modelFile``.

        The first space-separated token of line *n* (1-based) is mapped to
        *n*; the reserved key ``'__nok__'`` is mapped to 0. A token that occurs
        on several lines keeps the index of its last occurrence.

        NOTE(review): every line is treated as an entry, so if the file starts
        with a "<count> <dim>" header line its first token ends up in the
        vocabulary as well -- confirm against the actual file format.

        Args:
            modelFile: path of a UTF-8 encoded text file.
            max_entries: maximum number of lines to read.

        Returns:
            dict mapping tokens to integer indices.
        """
        vocab = {'__nok__': 0}
        # Binary mode keeps the original line splitting (on "\n" only).
        with open(modelFile, "rb") as f:
            for index, raw in enumerate(f, start=1):
                if index > max_entries:
                    break
                # Drop the line terminator so a single-token line does not
                # keep "\r\n" / "\n" as part of its key.
                line = raw.decode(encoding="utf-8").rstrip("\r\n")
                vocab[line.split(' ')[0]] = index
        return vocab

    def getSentences(self, file):
        """Read the TSV ``file`` and return the preprocessed OFF tweets.

        Only rows with ``subtask_a == 'OFF'`` and ``subtask_b`` equal to
        ``'UNT'`` or ``'TIN'`` are kept; other rows are ignored without being
        preprocessed.

        Args:
            file: path of a tab-separated file with a header row containing
                the columns ``tweet``, ``subtask_a`` and ``subtask_b``.

        Returns:
            ``(train_off_unt, train_off_tin)``: two lists of preprocessed
            tweets, in file order.
        """
        train = pd.read_csv(file, sep='\t', header=0)
        train_off_unt = []
        train_off_tin = []
        buckets = {'UNT': train_off_unt, 'TIN': train_off_tin}
        rows = zip(train['tweet'], train['subtask_a'], train['subtask_b'])
        for tweet, label_a, label_b in rows:
            if str(label_a) != 'OFF':
                continue
            bucket = buckets.get(str(label_b))
            if bucket is None:
                continue
            sentence = self.prePro_1(str(tweet))
            bucket.append(self.prePro_2_attEmoji(sentence, False))
        return train_off_unt, train_off_tin

    def getDivided(self, cross_deviation):
        """Split the stored tweets into a training and a validation part.

        The validation part is one slice of each list; everything outside the
        slices is training data. UNT training documents are then re-sampled at
        random (with replacement) and appended until the training part holds
        as many label-0 as label-1 documents.

        NOTE(review): the slice bounds are hard-coded. Fold 0 takes TIN
        indices [0, 776) while fold 1 starts at 775, so index 775 is validated
        in both folds -- confirm whether that overlap is intended.

        Args:
            cross_deviation: fold number; 0 and 4 use special bounds.

        Returns:
            ``(train_docs, train_labels, valid_docs, valid_labels)`` where the
            labels are 0 for UNT and 1 for TIN documents.
        """
        unt_start = self._UNT_FOLD * cross_deviation
        unt_end = self._UNT_FOLD * (cross_deviation + 1)
        tin_start = self._TIN_FOLD * cross_deviation
        tin_end = self._TIN_FOLD * (cross_deviation + 1)
        if cross_deviation == 0:
            tin_end = 776
        elif cross_deviation == 4:
            unt_end = 524

        t_unt = self.train_off_unt[:unt_start] + self.train_off_unt[unt_end:]
        t_tin = self.train_off_tin[:tin_start] + self.train_off_tin[tin_end:]
        train_docs = t_unt + t_tin
        train_labels = [0] * len(t_unt) + [1] * len(t_tin)

        # Oversample UNT up to the size of TIN. With no UNT documents there is
        # nothing to draw from (np.random.randint(0, 0) would raise).
        if t_unt:
            for _ in range(len(t_tin) - len(t_unt)):
                index = np.random.randint(0, len(t_unt))
                train_docs.append(t_unt[index])
                train_labels.append(0)

        valid_unt = self.train_off_unt[unt_start:unt_end]
        valid_tin = self.train_off_tin[tin_start:tin_end]
        valid_docs = valid_unt + valid_tin
        # Label counts follow the slices actually taken, so labels and
        # documents stay aligned even when a list is shorter than the bounds.
        valid_labels = [0] * len(valid_unt) + [1] * len(valid_tin)
        return train_docs, train_labels, valid_docs, valid_labels

def create_model(train_docs, train_labels, valid_docs, valid_labels, k=1500):
    """Train a TF-IDF + chi2 + LinearSVC pipeline and score it on the validation set.

    Args:
        train_docs: training documents (strings).
        train_labels: one class label per training document.
        valid_docs: validation documents (strings).
        valid_labels: one class label per validation document.
        k: maximum number of features kept by the chi2 selection.

    Returns:
        The macro-averaged F1 score of the predictions on ``valid_docs``.
    """
    vectorizer = TfidfVectorizer(use_idf=True, token_pattern=r"(?u)\b\w+\b", max_df=0.5, min_df=1)
    train_X = vectorizer.fit_transform(train_docs)
    valid_X = vectorizer.transform(valid_docs)

    # Never ask for more features than the vocabulary provides: SelectKBest
    # rejects (or, depending on the scikit-learn version, warns about) a k
    # larger than the number of columns.
    selector = SelectKBest(chi2, k=min(k, train_X.shape[1]))
    train_X = selector.fit_transform(train_X, train_labels)
    valid_X = selector.transform(valid_X)

    clf = LinearSVC(dual=False)
    clf.fit(train_X, train_labels)
    pred = clf.predict(valid_X)
    return f1_score(valid_labels, pred, average='macro')

# Load the training data and the emoji vocabulary from their local paths,
# then build the split for fold 0.
dp = DataProcessor(
    dataFile="D:/SemEval2019-Task6/workspace/data/train/offenseval-training-v1.tsv",
    emojiFile="D:/SemEval2019-Task6/workspace/wordModel/emoji2vec200.txt",
)
train_docs, train_labels, valid_docs, valid_labels = dp.getDivided(cross_deviation=0)

# Disabled: manual macro-F1 computation from confusion counts, kept for reference.
# NOTE: it cannot be re-enabled as written -- it indexes `pred` per sample, but
# create_model returns a single F1 score. Also, the values named "precision"
# divide by the per-true-label totals (i.e. they are recalls) and vice versa;
# the resulting F1 values are unaffected because F1 is symmetric in the two.
# pred = create_model(train_docs, train_labels, valid_docs, valid_labels)
# Tunt, Funt, Ttin, Ftin = 0., 0., 0., 0.
# for i in range(len(pred)):
#     if pred[i] == 0 and valid_labels[i] == 0:
#         Tunt += 1
#     elif pred[i] == 1 and valid_labels[i] == 0:
#         Funt += 1
#     elif pred[i] == 1 and valid_labels[i] == 1:
#         Ttin += 1
#     elif pred[i] == 0 and valid_labels[i] == 1:
#         Ftin += 1
# macro_precision_not = Tunt / (Tunt + Funt)
# macro_precision_off = Ttin / (Ttin + Ftin)
# macro_recall_not = Tunt / (Tunt + Ftin)
# macro_recall_off = Ttin / (Ttin + Funt)
# macro_f1_not = (2 * macro_precision_not * macro_recall_not) / (macro_precision_not + macro_recall_not)
# macro_f1_off = (2 * macro_precision_off * macro_recall_off) / (macro_precision_off + macro_recall_off)
# mean_macro_accuracy = (Tunt + Ttin) / (Tunt + Funt + Ttin + Ftin)
# mean_macro_precision = (macro_precision_not + macro_precision_off) / 2
# mean_macro_recall = (macro_recall_not + macro_recall_off) / 2
# mean_macro_f1 = (macro_f1_not + macro_f1_off) / 2
# print(Tunt)
# print(Funt)
# print(Ttin)
# print(Ftin)
# print(mean_macro_f1)

# Train on the split and print the macro-averaged F1 of the validation part.
print(create_model(
    train_docs=train_docs,
    train_labels=train_labels,
    valid_docs=valid_docs,
    valid_labels=valid_labels,
))
