#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
SVM
Created on Mon Dec  7 11:55:36 2020

@author: fpannach
"""
import sys
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import ComplementNB
from sklearn import svm

#https://scikit-learn.org/stable/auto_examples/text/plot_document_classification_20newsgroups.html

# Entry point: load the data and run the baseline classifiers
def main():
    """Train and evaluate the SVM and Complement Naive Bayes baselines.

    The language is taken from the first command-line argument and defaults
    to "english" when none is given. The train and dev CSV files for that
    language are read from the baseline directory, vectorized once, and the
    resulting data is shared by both classifiers.

    Raises:
        ValueError: If no hyperparameters are configured for the language.
    """
    # sys.argv[0] is the script name, so the language is the first real argument.
    lang = sys.argv[1] if len(sys.argv) > 1 else "english"
    directory = "~/PhD/Code/SharedTask_HopeSpeech/HopeSpeechDetection/Baseline/"

    # Per-language settings: (C for the SVM, alpha for Complement Naive Bayes).
    hyperparameters = {
        "english": (1.4, 0.7),
        "malayalam": (1.3, 0.9),
        "tamil": (1.1, 10.5),
    }
    if lang not in hyperparameters:
        # Fail before loading anything instead of hitting an unbound variable later.
        raise ValueError(
            f"No hyperparameters configured for language {lang!r}; "
            f"expected one of {sorted(hyperparameters)}"
        )
    c, alpha = hyperparameters[lang]

    train_path = directory + lang + "_train_features_sent.csv"
    test_path = directory + lang + "_dev_features_sent.csv"
    print(train_path)
    Corpus = pd.read_csv(train_path, index_col="Unnamed: 0")
    Corpus_test = pd.read_csv(test_path, index_col="Unnamed: 0")

    # Vectorize and encode once; both classifiers only read from the result.
    data = prepare_data(Corpus, Corpus_test)
    svm_class(data, c)
    cnb_class(data, alpha)

def prepare_data(Corpus, Corpus_test):
    """Encode the labels and TF-IDF-vectorize the text of both splits.

    Args:
        Corpus: Training frame with a "text" and a "label" column.
        Corpus_test: Evaluation frame with the same two columns.

    Returns:
        A list ``[train_X, test_X, train_y, test_y]`` holding the TF-IDF
        matrices followed by the integer-encoded labels.

    Raises:
        ValueError: Raised by the encoder if the evaluation split contains a
            label that does not occur in the training split.
    """
    Encoder = LabelEncoder()
    Train_Y = Encoder.fit_transform(Corpus["label"])
    # Reuse the mapping learned from the training labels. Re-fitting on the
    # test labels could assign different integers to the same class whenever
    # the two splits do not contain exactly the same set of labels.
    Test_Y = Encoder.transform(Corpus_test["label"])

    Train_X_Tfidf, Test_X_Tfidf = tfidf_vec(Corpus, Corpus_test)
    return [Train_X_Tfidf, Test_X_Tfidf, Train_Y, Test_Y]



def tfidf_vec(Corpus, Corpus_test):
    """Return TF-IDF matrices for the "text" column of both frames.

    The vectorizer is limited to 5000 features and is fitted on the
    training text only; the same fitted vectorizer then transforms the
    training and the evaluation text.
    """
    # fit() returns the vectorizer itself, so construction and fitting chain.
    vectorizer = TfidfVectorizer(max_features=5000).fit(Corpus["text"])

    train_matrix = vectorizer.transform(Corpus["text"])
    test_matrix = vectorizer.transform(Corpus_test["text"])
    return train_matrix, test_matrix

# Standard NB
def nb_class(data):
    """Fit a multinomial Naive Bayes classifier and print its scores.

    Args:
        data: Sequence ``[train_X, test_X, train_y, test_y]``; the model is
            fitted on the training pair and scored on the test pair.
    """
    # The module never imports `naive_bayes`, so the classifier is imported
    # here to keep this function usable on its own.
    from sklearn.naive_bayes import MultinomialNB

    Naive = MultinomialNB()
    Naive.fit(data[0], data[2])
    predictions_NB = Naive.predict(data[1])

    # Scores are reported as percentages; precision, F1 and recall are
    # support-weighted averages over the classes.
    print("Naive Bayes Accuracy Score -> ", accuracy_score(data[3], predictions_NB)*100)
    print("Naive Bayes Precision Score -> ", precision_score(data[3], predictions_NB, average="weighted")*100)
    print("Naive Bayes F1 Score -> ", f1_score(data[3], predictions_NB, average="weighted", zero_division=0)*100)
    print("Naive Bayes Recall -> ", recall_score(data[3], predictions_NB, average="weighted")*100)

##optimize SVM
def optimize_svm(data):
    """Sweep the SVM regularization parameter C and return the best value.

    A linear-kernel SVM is fitted for every C in 0.1, 0.2, ..., 1.4 and
    scored with the weighted F1 on the test pair of ``data``.

    Args:
        data: Sequence ``[train_X, test_X, train_y, test_y]``.

    Returns:
        The C value with the highest weighted F1 score; on ties the
        smallest such C is kept.
    """
    alpha_max = None
    # Start below any attainable score so the first candidate is always
    # recorded; otherwise an all-zero F1 run would leave alpha_max unset.
    f1_max = -1.0

    for alpha in (x * 0.1 for x in range(1, 15)):
        SVM = svm.SVC(C=alpha, kernel='linear', degree=3, gamma='auto')
        SVM.fit(data[0], data[2])
        predictions_svm = SVM.predict(data[1])
        f1 = f1_score(data[3], predictions_svm, average="weighted", zero_division=0)*100
        if f1 > f1_max:
            alpha_max = alpha
            f1_max = f1
    return alpha_max

def svm_class(data,c) :
    """Fit a linear-kernel SVM with regularization ``c`` and print its scores.

    ``data`` is indexed as ``[train_X, test_X, train_y, test_y]``: the model
    is fitted on the training pair and evaluated on the test pair. All
    scores are printed as percentages.
    """
    classifier = svm.SVC(C=c, kernel='linear', degree=3, gamma='auto')
    classifier.fit(data[0], data[2])
    predicted = classifier.predict(data[1])
    gold = data[3]

    # (label, metric function, keyword options), evaluated and printed in order.
    reports = (
        ("SVM Accuracy Score -> ", accuracy_score, {}),
        ("SVM Precision Score -> ", precision_score, {"average": "weighted"}),
        ("SVM F1 Score -> ", f1_score, {"average": "weighted", "zero_division": 0}),
        ("SVM Recall-> ", recall_score, {"average": "weighted"}),
    )
    for label, metric, options in reports:
        print(label, metric(gold, predicted, **options)*100)


### NB optimization
def optimize_NB(data):
    """Sweep the Complement Naive Bayes smoothing parameter alpha.

    A normalized Complement NB model is fitted for every alpha in
    0.0, 0.1, ..., 1.4 and scored with the weighted F1 on the test pair of
    ``data``. The best setting is printed and returned.

    Args:
        data: Sequence ``[train_X, test_X, train_y, test_y]``.

    Returns:
        The alpha with the highest weighted F1 score; on ties the smallest
        such alpha is kept.
    """
    alpha_max = None
    # Start below any attainable score so the first candidate is always
    # recorded and alpha_max is defined when the result is printed.
    f1_max = -1.0

    # NOTE(review): the sweep starts at alpha=0 (no smoothing) -- confirm
    # that this is intended for the installed scikit-learn version.
    for alpha in (x * 0.1 for x in range(0, 15)):
        # ComplementNB is imported directly at module level; the module
        # name `naive_bayes` is not in scope.
        cnb = ComplementNB(alpha=alpha, norm=True)
        cnb.fit(data[0], data[2])
        predictions_cnb = cnb.predict(data[1])
        # Score against the test labels carried in `data`; there is no
        # global Test_Y.
        f1 = f1_score(data[3], predictions_cnb, average="weighted", zero_division=0)*100
        if f1 > f1_max:
            alpha_max = alpha
            f1_max = f1
    print("Best ----  Alpha:", alpha_max, "F1-Score:",f1_max)
    return alpha_max

def cnb_class(data, alpha):
    """Fit a normalized Complement Naive Bayes model and print its scores.

    Args:
        data: Sequence ``[train_X, test_X, train_y, test_y]``; the model is
            fitted on the training pair and scored on the test pair.
        alpha: Smoothing parameter passed to the classifier.
    """
    # ComplementNB is imported directly at module level; the module name
    # `naive_bayes` is not in scope and would raise NameError.
    cnb = ComplementNB(alpha=alpha, norm=True)
    cnb.fit(data[0], data[2])
    predictions_cnb = cnb.predict(data[1])

    # Scores are reported as percentages; precision, F1 and recall are
    # support-weighted averages over the classes.
    print("C Naive Bayes Accuracy Score -> ",accuracy_score(data[3], predictions_cnb)*100)
    print("C Naive Bayes Precision Score -> ",precision_score(data[3], predictions_cnb,average="weighted")*100)
    print("C Naive Bayes F1 Score -> ",f1_score(data[3], predictions_cnb,average="weighted", zero_division=0)*100)
    print("C Naive Bayes Recall -> ",recall_score(data[3], predictions_cnb,average="weighted")*100)

# Run the baselines only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
