from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
from sklearn.metrics import classification_report

import pandas, numpy, string
import pickle as pkl


# Number of leading samples used for training; everything after it is held
# out for validation.
TRAIN_SIZE = 11335


def _read_lines(path):
    """Return the newline-separated entries of the UTF-8 text file at ``path``.

    Undecodable bytes are ignored. The file is closed before returning.
    """
    with open(path, 'r', encoding='utf-8', errors='ignore') as handle:
        lines = handle.read().split('\n')
    # A file that ends with a newline produces one spurious empty entry at the
    # end of the split; drop it so it is not treated as a sample (and so the
    # sentence and label files stay aligned regardless of the final newline).
    if lines and lines[-1] == '':
        lines.pop()
    return lines


texts = _read_lines('train_sentence_ln_tag.txt')
labels = _read_lines('train_label.txt')
test_text = _read_lines('test_sentence_ln_tag.txt')

# Positional split: first TRAIN_SIZE samples train, the rest validate.
train_x = texts[:TRAIN_SIZE]
train_y = labels[:TRAIN_SIZE]

valid_x = texts[TRAIN_SIZE:]
valid_y = labels[TRAIN_SIZE:]

test_x = test_text

# Label-encode the target variable.
# The encoder is fitted exactly once, on every label, so that the training and
# validation targets share a single label -> integer mapping. Re-fitting on the
# validation labels alone can assign different integers to the same label
# whenever the two splits do not contain exactly the same set of classes.
encoder = preprocessing.LabelEncoder()
encoder.fit(labels)
train_y = encoder.transform(train_y)
valid_y = encoder.transform(valid_y)


# Bag-of-words counts: learn the vocabulary from every labelled sentence,
# then encode the train / validation / test splits with that vocabulary.
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(texts)
xtrain_count, xvalid_count, xtest_count = (
    count_vect.transform(split) for split in (train_x, valid_x, test_x)
)

# Word-level TF-IDF: same tokenisation, vocabulary capped at 5000 features.
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(texts)
xtrain_tfidf, xvalid_tfidf, xtest_tfidf = (
    tfidf_vect.transform(split) for split in (train_x, valid_x, test_x)
)

def train_model(classifier, feature_vector_train, label, feature_vector_test, model_name, label_test=None):
    """Fit ``classifier``, pickle it to ``model_name`` and score it on held-out data.

    Args:
        classifier: Estimator exposing ``fit`` and ``predict``.
        feature_vector_train: Feature matrix used for fitting.
        label: Encoded targets matching ``feature_vector_train``.
        feature_vector_test: Feature matrix of the held-out samples to score.
        model_name: Path of the pickle file the fitted classifier is written to.
        label_test: True targets for ``feature_vector_test``. When omitted, the
            module-level ``valid_y`` is used, which is what existing callers
            rely on.

    Returns:
        A ``(accuracy, report)`` tuple: the accuracy score and the text
        classification report for the held-out predictions.
    """
    if label_test is None:
        label_test = valid_y

    # Fit the training dataset on the classifier.
    classifier.fit(feature_vector_train, label)

    # Predict the labels of the held-out dataset.
    predictions = classifier.predict(feature_vector_test)

    # Persist the fitted model; the context manager guarantees the file is
    # flushed and closed even if pickling fails.
    with open(model_name, 'wb') as model_file:
        pkl.dump(classifier, model_file)

    # scikit-learn metrics take (y_true, y_pred). Passing them reversed leaves
    # accuracy unchanged but swaps precision and recall in the report.
    return (
        metrics.accuracy_score(label_test, predictions),
        classification_report(label_test, predictions),
    )
    

# Every classifier / feature-set pairing to evaluate, listed in reporting order.
# Each row: (report heading, classifier factory, training features,
#            validation features, pickle path for the fitted model)
experiments = [
    # Naive Bayes
    ("NB, Count Vectors: ", naive_bayes.MultinomialNB, xtrain_count, xvalid_count, 'NB_CV_ln.pkl'),
    ("NB, WordLevel TF-IDF: ", naive_bayes.MultinomialNB, xtrain_tfidf, xvalid_tfidf, 'NB_TFIDF_ln.pkl'),
    # Linear classifier (logistic regression)
    ("LR, Count Vectors: ", linear_model.LogisticRegression, xtrain_count, xvalid_count, 'LR_CV_ln.pkl'),
    ("LR, WordLevel TF-IDF: ", linear_model.LogisticRegression, xtrain_tfidf, xvalid_tfidf, 'LR_TFIDF_ln.pkl'),
    # Support vector machine
    ("SVM, Count Vectors: ", svm.SVC, xtrain_count, xvalid_count, 'SVM_CV_ln.pkl'),
    ("SVM, WordLevel TF-IDF: ", svm.SVC, xtrain_tfidf, xvalid_tfidf, 'SVM_TFIDF_ln.pkl'),
    # Random forest
    ("RF, Count Vectors: ", ensemble.RandomForestClassifier, xtrain_count, xvalid_count, 'RF_CV_ln.pkl'),
    ("RF, WordLevel TF-IDF: ", ensemble.RandomForestClassifier, xtrain_tfidf, xvalid_tfidf, 'RF_TFIDF_ln.pkl'),
]

for heading, make_classifier, train_features, valid_features, pickle_path in experiments:
    # A fresh, default-configured estimator is built for each run.
    accuracy, classi = train_model(make_classifier(), train_features, train_y, valid_features, pickle_path)
    print (heading, accuracy)
    print ('\n',classi)

## Testing a saved model ##
## Comment out the training runs above, then uncomment the lines below. ##
# model=pkl.load(open('LR_TFIDF_ln.pkl','rb'))
# predictions=model.predict(xtest_tfidf)  # the model was trained on TF-IDF features, not raw text
# for i in range(len(predictions)):
#     print (test_x[i],'\t',encoder.inverse_transform([predictions[i]])[0],end='\n')






