# -*- coding: utf-8 -*-
"""MBERT.ipynb

Automatically generated by Colaboratory.
mBERT-based classification of the comments in the Tamil offensive-language spreadsheets.

"""

# Notebook-only setup. The exported cells used IPython shell escapes
# ("!pip ..."), which are a SyntaxError when this file is run as a plain
# Python module. Install the dependencies before running the script:
#   pip install transformers
#   pip install -U sentence-transformers

from transformers import BertTokenizer, TFBertModel
# Load the tokenizer and the TensorFlow BERT encoder for the
# 'bert-base-multilingual-cased' checkpoint. Both are module-level names that
# the embedding loops further down call for every comment.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = TFBertModel.from_pretrained("bert-base-multilingual-cased")


def opensheet(location,sheet_name):
    """Open an Excel workbook and return one of its sheets.

    Args:
        location: Path of the workbook file to open.
        sheet_name: Name of the sheet to fetch from the workbook.

    Returns:
        The sheet object returned by ``sheet_by_name``.

    Side effects:
        Prints the value of the sheet's top-left cell followed by a
        separator line, as a quick sanity check of what was loaded.
    """
    selected = xlrd.open_workbook(location).sheet_by_name(sheet_name)
    top_left = selected.cell_value(0,0)
    print (top_left)
    print ('***********')
    return selected

import xlrd
# Read the development split: column 0 is collected as the comment text and
# column 1 as its label.
# NOTE(review): every row starting at row 0 is read; if the sheet has a
# header row it ends up in the data -- confirm against the spreadsheet.
loc = '/content/tamil_offensive_full_dev.xlsx'
sheet = opensheet(loc, 'tamil_offensive_full_dev')

dev_rows = range(sheet.nrows)
dev_comm = [sheet.cell_value(row, 0) for row in dev_rows]
dev_label = [sheet.cell_value(row, 1) for row in dev_rows]

print (dev_comm[0])

import numpy as np

# Embed every development comment with mBERT. output[0] is the model's first
# output (the per-token hidden states); [0][0] selects the first sequence in
# the batch and its first token, so each comment becomes one vector.
# Each vector is converted to NumPy so the collection can be stacked into a
# 2-D array, which is what scikit-learn and Keras expect as features.
maldev_encodedout=[]
for text in dev_comm:
  print (text)
  # str() guards against non-text spreadsheet cells (e.g. numeric values),
  # which the tokenizer would reject.
  encoded_input = tokenizer(str(text),truncation="longest_first",max_length=512, return_tensors='tf')
  output = model(encoded_input)
  maldev_encodedout.append(output[0][0][0].numpy())
maldev_encodedout = np.asarray(maldev_encodedout)

# The classifier setup later in the file refers to these features as
# `dev_embeddings`; expose that name so it is actually defined.
dev_embeddings = maldev_encodedout

print (len(maldev_encodedout))

# Read the training split: column 0 is collected as the comment text and
# column 1 as its label.
# NOTE(review): every row starting at row 0 is read; if the sheet has a
# header row it ends up in the data -- confirm against the spreadsheet.
loc = '/content/tamil_offensive_full_train.xlsx'
sheet = opensheet(loc, 'tamil_offensive_full_train')

train_rows = range(sheet.nrows)
train_comm = [sheet.cell_value(row, 0) for row in train_rows]
train_label = [sheet.cell_value(row, 1) for row in train_rows]

print (train_comm[0])

import numpy as np

# Embed every training comment with mBERT. output[0] is the model's first
# output (the per-token hidden states); [0][0] selects the first sequence in
# the batch and its first token, so each comment becomes one vector.
# Each vector is converted to NumPy so the collection can be stacked into a
# 2-D array, which is what scikit-learn and Keras expect as features.
train_embeddings=[]
for text in train_comm:
  print (text)
  # str() guards against non-text spreadsheet cells (e.g. numeric values),
  # which the tokenizer would reject.
  encoded_input = tokenizer(str(text),truncation="longest_first",max_length=512, return_tensors='tf')
  output = model(encoded_input)
  train_embeddings.append(output[0][0][0].numpy())
train_embeddings = np.asarray(train_embeddings)


from sklearn.preprocessing import LabelEncoder

# Encode class values as integers.
# A single encoder is shared by both splits so that a given class name maps
# to the same integer in train and dev; fitting two independent encoders
# gives mismatched ids whenever the splits do not contain exactly the same
# set of classes. Fitting on the union also avoids an "unseen label" error
# if the dev split contains a class the training split lacks.
encoder = LabelEncoder()
encoder.fit(list(train_label) + list(dev_label))
dev_encoded_Y = encoder.transform(dev_label)
train_encoded_Y = encoder.transform(train_label)

# Read the test split. Only column 0 (the comment text) is collected; no
# labels are read for this split.
loc = '/content/tamil_offensive_full_test.xlsx'
sheet = opensheet(loc, 'tamil_offensive_full_test')
print(sheet.cell_value(0, 0) )
test_comm = [sheet.cell_value(row, 0) for row in range(sheet.nrows)]

print (test_comm[0])

import numpy as np

# TFBertModel has no encode() method (that is the sentence-transformers API),
# so the test comments are embedded the same way as the train/dev comments:
# tokenize each text, run the model, and keep the first token's vector from
# the first output. The vectors are stacked into a 2-D NumPy array.
test_embeddings = []
for text in test_comm:
  # str() guards against non-text spreadsheet cells (e.g. numeric values).
  encoded_input = tokenizer(str(text),truncation="longest_first",max_length=512, return_tensors='tf')
  output = model(encoded_input)
  test_embeddings.append(output[0][0][0].numpy())
test_embeddings = np.asarray(test_embeddings)

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from keras.callbacks import EarlyStopping, ModelCheckpoint

# define baseline model
def baseline_model(input_dim=768, num_classes=5):
    """Build and compile the feed-forward classifier used on the embeddings.

    The network is one hidden ReLU layer of 100 units followed by a softmax
    output layer, compiled with categorical cross-entropy, the Adam optimizer
    and accuracy as the reported metric.

    Args:
        input_dim: Length of each input feature vector. Defaults to 768, the
            value this script has always used.
        num_classes: Number of units in the softmax output layer; it must
            equal the width of the one-hot label matrix. Defaults to 5, the
            value this script has always used.

    Returns:
        The compiled ``Sequential`` model.
    """
    # create model
    model = Sequential()
    model.add(Dense(100, input_dim=input_dim, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

import numpy as np
# Show which integer classes occur in each split. In a script a bare
# expression displays nothing, so the values are printed explicitly.
print(np.unique(dev_encoded_Y))
print(np.unique(train_encoded_Y))

from keras.utils import to_categorical

# One-hot encode both splits with the same width. Without an explicit
# num_classes each call sizes its matrix from its own largest label, so the
# train and dev matrices can end up with different numbers of columns.
# The classifier's softmax layer must have this many units.
num_classes = int(max(np.max(train_encoded_Y), np.max(dev_encoded_Y))) + 1
train_encoded_Y=to_categorical(train_encoded_Y, num_classes=num_classes)
dev_encoded_Y=to_categorical(dev_encoded_Y, num_classes=num_classes)

# scikit-learn and Keras expect dense 2-D feature arrays. Converting row by
# row works whether the embeddings are still TensorFlow tensors or already
# NumPy vectors.
train_embeddings = np.asarray([np.asarray(vec) for vec in train_embeddings])
# The development embeddings were collected under the name
# `maldev_encodedout`; `dev_embeddings` was never defined, so define it here.
dev_embeddings = np.asarray([np.asarray(vec) for vec in maldev_encodedout])

# Wrap the Keras model as a scikit-learn estimator; the dev split is passed
# through to Keras as validation data for every fit.
estimator = KerasClassifier(build_fn=baseline_model, epochs=20, batch_size=5, verbose=1,
                            validation_data=(dev_embeddings, dev_encoded_Y))

# 10-fold cross-validation over the training split, with shuffling.
kfold = KFold(n_splits=10, shuffle=True)

results = cross_val_score(estimator, train_embeddings, train_encoded_Y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

# Train the classifier on the full training split, then predict a class for
# every test embedding.
estimator.fit(train_embeddings, train_encoded_Y)
predictions = estimator.predict(test_embeddings)
print(predictions)

# Map the predicted integer classes back to label values using the
# LabelEncoder fitted earlier in the file.
# NOTE(review): assumes predict() returns integer class ids compatible with
# that encoder -- confirm.
labelpredictions = encoder.inverse_transform(predictions)
print (labelpredictions)

import numpy as np
import pandas as pd
# Write the predicted class ids and the decoded labels to CSV files.
# to_csv() returns None when given a path, so the DataFrames are bound first
# and written in a separate step; `prediction` and `predictionclass` then
# hold the actual tables instead of None.
prediction = pd.DataFrame(predictions, columns=['predictions'])
prediction.to_csv('/content/prediction.csv')
predictionclass = pd.DataFrame(labelpredictions, columns=['labelpredictions'])
predictionclass.to_csv('/content/predictionlabel.csv')
