# -*- coding: utf-8 -*-
"""LaBSE.ipynb

Automatically generated by Colaboratory.
Language-Agnostic BERT (LaBSE) based classification
"""

!pip install transformers

!pip install -U sentence-transformers

from sentence_transformers import SentenceTransformer

# Build a SentenceTransformer from the 'LaBSE' model name. The resulting
# module-level `model` is what the sections below call `.encode(...)` on to
# turn lists of comments into embedding arrays.
model = SentenceTransformer('LaBSE')

def opensheet(location, sheet_name):
    """Open the workbook at ``location`` and return its sheet ``sheet_name``.

    Before returning, prints the value of the sheet's first cell (row 0,
    column 0) followed by a separator line, as a quick visual check of what
    was loaded. Uses the module-level ``xlrd`` import.
    """
    worksheet = xlrd.open_workbook(location).sheet_by_name(sheet_name)
    first_cell = worksheet.cell_value(0, 0)
    print(first_cell)
    print('***********')
    return worksheet

import xlrd

# Load the Malayalam dev split: for every row of the sheet, column 0 goes to
# dev_comm and column 1 to dev_label.
# NOTE(review): reading starts at row 0, so if the sheet has a header row it is
# treated as data -- confirm against the spreadsheet.
loc = '/content/mal_full_offensive_dev.xlsx'
sheet = opensheet(loc, 'mal_full_offensive_dev')

dev_comm = [sheet.cell_value(row, 0) for row in range(sheet.nrows)]
dev_label = [sheet.cell_value(row, 1) for row in range(sheet.nrows)]

print(dev_comm[0])

# Encode with the SentenceTransformer already bound to `model` earlier in the
# file; constructing it a second time here only repeats the model load.
dev_embeddings = model.encode(dev_comm)

print(dev_embeddings.shape)

# Read the Malayalam training sheet: column 0 of each row is collected into
# train_comm and column 1 into train_label, then the comments are embedded.
loc = '/content/mal_full_offensive_train.xlsx'
sheet = opensheet(loc, 'mal_full_offensive_train')

train_comm = [sheet.cell_value(row_idx, 0) for row_idx in range(sheet.nrows)]
train_label = [sheet.cell_value(row_idx, 1) for row_idx in range(sheet.nrows)]

print(train_comm[0])

train_embeddings = model.encode(train_comm)

from sklearn.preprocessing import LabelEncoder

# Encode class values as integers.
# One encoder is fitted on the training labels and reused for the dev labels,
# so a given label value maps to the same integer in both splits. Fitting a
# second, independent encoder on the dev labels can assign different integers
# whenever the two splits do not contain exactly the same set of labels.
# `encoder` stays bound to this fitted instance because it is used later to
# map predicted integers back to label values.
encoder = LabelEncoder()
train_encoded_Y = encoder.fit_transform(train_label)
# transform() raises ValueError if dev_label holds a value absent from train_label.
dev_encoded_Y = encoder.transform(dev_label)

# Read the Malayalam test sheet. Only column 0 is read here; no label list is
# built for this split.
loc = '/content/mal_full_offensive_test.xlsx'
sheet = opensheet(loc, 'mal_full_offensive_test')
print(sheet.cell_value(0, 0))
test_comm = [sheet.cell_value(row_idx, 0) for row_idx in range(sheet.nrows)]

print(test_comm[0])

test_embeddings = model.encode(test_comm)

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from keras.callbacks import EarlyStopping, ModelCheckpoint

# define baseline model
def baseline_model(num_classes=5, input_dim=768):
    """Build and compile the baseline feed-forward classifier.

    The network is a 100-unit ReLU hidden layer followed by a softmax output
    layer, compiled with categorical cross-entropy, the Adam optimizer and an
    accuracy metric.

    Args:
        num_classes: Number of units in the softmax output layer. Defaults to
            5, the value previously hard-coded here.
        input_dim: Width of each input vector. Defaults to 768, the value
            previously hard-coded here; it must match the second dimension of
            the embedding arrays fed to the model.

    Returns:
        The compiled ``Sequential`` model.
    """
    # Named `net` so it does not shadow the module-level `model`.
    net = Sequential()
    net.add(Dense(100, input_dim=input_dim, activation='relu'))
    net.add(Dense(num_classes, activation='softmax'))
    net.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net

import numpy as np
import pandas as pd
from keras.utils import to_categorical

# Show which integer classes occur in each split (bare expressions print
# nothing when this file runs as a script).
print(np.unique(dev_encoded_Y))
print(np.unique(train_encoded_Y))

# One-hot encode both splits to the same width. Without an explicit
# num_classes, to_categorical sizes each array from its own largest value, so
# a split that lacks the highest class would come out one column narrower and
# no longer match the other split or the model output.
num_classes = int(max(train_encoded_Y.max(), dev_encoded_Y.max())) + 1
train_encoded_Y = to_categorical(train_encoded_Y, num_classes=num_classes)
dev_encoded_Y = to_categorical(dev_encoded_Y, num_classes=num_classes)

# scikit-learn wrapper around baseline_model: 20 epochs, batch size 5, with the
# dev split passed as validation data on every fit.
estimator = KerasClassifier(build_fn=baseline_model, epochs=20, batch_size=5, verbose=1,validation_data=(dev_embeddings,dev_encoded_Y))

# 10-fold shuffled cross-validation on the training embeddings.
kfold = KFold(n_splits=10, shuffle=True)

results = cross_val_score(estimator, train_embeddings, train_encoded_Y, cv=kfold)
print("Baseline: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

# Fit on the full training split, then predict the test embeddings.
estimator.fit(train_embeddings, train_encoded_Y)
predictions = estimator.predict(test_embeddings)
print(predictions)

# Map the predicted integers back to label values with the fitted encoder.
labelpredictions = encoder.inverse_transform(predictions)
print(labelpredictions)

# to_csv returns None when given a path, so its result is not bound to a name.
pd.DataFrame(predictions, columns=['predictions']).to_csv('/content/prediction.csv')
pd.DataFrame(labelpredictions, columns=['labelpredictions']).to_csv('/content/predictionlabel.csv')

"""For Kannada"""

import xlrd

# Load the Kannada dev split: for every row of the sheet, column 0 goes to
# dev_comm and column 1 to dev_label.
# NOTE(review): reading starts at row 0, so if the sheet has a header row it is
# treated as data -- confirm against the spreadsheet.
loc = '/content/kannada_offensive_dev.xlsx'
sheet = opensheet(loc, 'kannada_offensive_dev')

dev_comm = [sheet.cell_value(row, 0) for row in range(sheet.nrows)]
dev_label = [sheet.cell_value(row, 1) for row in range(sheet.nrows)]

print(dev_comm[0])

# Encode with the SentenceTransformer already bound to `model` earlier in the
# file; constructing it again here only repeats the model load.
dev_embeddings = model.encode(dev_comm)

# Read the Kannada training sheet: column 0 of each row is collected into
# train_comm and column 1 into train_label, then the comments are embedded.
loc = '/content/kannada_offensive_train.xlsx'
sheet = opensheet(loc, 'kannada_offensive_train')

train_comm = [sheet.cell_value(row_idx, 0) for row_idx in range(sheet.nrows)]
train_label = [sheet.cell_value(row_idx, 1) for row_idx in range(sheet.nrows)]

print(train_comm[0])

train_embeddings = model.encode(train_comm)

from sklearn.preprocessing import LabelEncoder

print(train_embeddings.shape)
print(dev_embeddings.shape)

# Encode class values as integers with a single encoder fitted on the training
# labels and reused for the dev labels, so a given label value maps to the same
# integer in both splits. Two independently fitted encoders can disagree
# whenever the splits do not contain exactly the same set of labels.
# `encoder` stays bound to this fitted instance because it is used later to
# map predicted integers back to label values.
encoder = LabelEncoder()
train_encoded_Y = encoder.fit_transform(train_label)
# transform() raises ValueError if dev_label holds a value absent from train_label.
dev_encoded_Y = encoder.transform(dev_label)

# Read the Kannada test sheet. Only column 0 is read here; no label list is
# built for this split.
loc = '/content/kannada_offensive_test.xlsx'
sheet = opensheet(loc, 'kannada_offensive_test')
print(sheet.cell_value(0, 0))
test_comm = [sheet.cell_value(row_idx, 0) for row_idx in range(sheet.nrows)]

print(test_comm[0])

test_embeddings = model.encode(test_comm)

import numpy as np
from keras.utils import to_categorical

# Show which integer classes occur in each split (bare expressions print
# nothing when this file runs as a script).
print(np.unique(dev_encoded_Y))
print(np.unique(train_encoded_Y))

# One-hot encode both splits to the same width. Without an explicit
# num_classes, to_categorical sizes each array from its own largest value, so
# a split that lacks the highest class would come out one column narrower.
num_classes = int(max(train_encoded_Y.max(), dev_encoded_Y.max())) + 1
train_encoded_Y = to_categorical(train_encoded_Y, num_classes=num_classes)
dev_encoded_Y = to_categorical(dev_encoded_Y, num_classes=num_classes)

print(test_embeddings.shape)

def baseline_model(num_classes=6, input_dim=768):
    """Build and compile the baseline feed-forward classifier.

    The network is a 100-unit ReLU hidden layer followed by a softmax output
    layer, compiled with categorical cross-entropy, the Adam optimizer and an
    accuracy metric.

    Args:
        num_classes: Number of units in the softmax output layer. Defaults to
            6, the value previously hard-coded here.
        input_dim: Width of each input vector. Defaults to 768, the value
            previously hard-coded here; it must match the second dimension of
            the embedding arrays fed to the model.

    Returns:
        The compiled ``Sequential`` model.
    """
    # Named `net` so it does not shadow the module-level `model`.
    net = Sequential()
    net.add(Dense(100, input_dim=input_dim, activation='relu'))
    net.add(Dense(num_classes, activation='softmax'))
    net.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net

import numpy as np
import pandas as pd

# scikit-learn wrapper around baseline_model: 20 epochs, batch size 5, with the
# dev split passed as validation data during fit.
estimator = KerasClassifier(build_fn=baseline_model, epochs=20, batch_size=5, verbose=1,validation_data=(dev_embeddings,dev_encoded_Y))

# Fit on the training split, then predict the test embeddings.
estimator.fit(train_embeddings, train_encoded_Y)
predictions = estimator.predict(test_embeddings)
print(predictions)

# Map the predicted integers back to label values with the fitted encoder.
labelpredictions = encoder.inverse_transform(predictions)
print(labelpredictions)

# to_csv returns None when given a path, so its result is not bound to a name.
pd.DataFrame(predictions, columns=['predictions']).to_csv('/content/kprediction.csv')
pd.DataFrame(labelpredictions, columns=['labelpredictions']).to_csv('/content/kpredictionlabel.csv')

"""For Tamil"""

import xlrd

# Load the Tamil dev split: for every row of the sheet, column 0 goes to
# dev_comm and column 1 to dev_label.
# NOTE(review): reading starts at row 0, so if the sheet has a header row it is
# treated as data -- confirm against the spreadsheet.
loc = '/content/tamil_offensive_full_dev.xlsx'
sheet = opensheet(loc, 'tamil_offensive_full_dev')

dev_comm = [sheet.cell_value(row, 0) for row in range(sheet.nrows)]
dev_label = [sheet.cell_value(row, 1) for row in range(sheet.nrows)]

print(dev_comm[0])

# Encode with the SentenceTransformer already bound to `model` earlier in the
# file; constructing it again here only repeats the model load.
dev_embeddings = model.encode(dev_comm)

# Read the Tamil training sheet: column 0 of each row is collected into
# train_comm and column 1 into train_label, then the comments are embedded.
loc = '/content/tamil_offensive_full_train.xlsx'
sheet = opensheet(loc, 'tamil_offensive_full_train')

train_comm = [sheet.cell_value(row_idx, 0) for row_idx in range(sheet.nrows)]
train_label = [sheet.cell_value(row_idx, 1) for row_idx in range(sheet.nrows)]

print(train_comm[0])

train_embeddings = model.encode(train_comm)

from sklearn.preprocessing import LabelEncoder

print(train_embeddings.shape)
print(dev_embeddings.shape)

# Encode class values as integers with a single encoder fitted on the training
# labels and reused for the dev labels, so a given label value maps to the same
# integer in both splits. Two independently fitted encoders can disagree
# whenever the splits do not contain exactly the same set of labels.
# `encoder` stays bound to this fitted instance because it is used later to
# map predicted integers back to label values.
encoder = LabelEncoder()
train_encoded_Y = encoder.fit_transform(train_label)
# transform() raises ValueError if dev_label holds a value absent from train_label.
dev_encoded_Y = encoder.transform(dev_label)

# Read the Tamil test sheet. Only column 0 is read here; no label list is
# built for this split.
loc = '/content/tamil_offensive_full_test.xlsx'
sheet = opensheet(loc, 'tamil_offensive_full_test')
print(sheet.cell_value(0, 0))
test_comm = [sheet.cell_value(row_idx, 0) for row_idx in range(sheet.nrows)]

print(test_comm[0])

test_embeddings = model.encode(test_comm)

import numpy as np
from keras.utils import to_categorical

# Show which integer classes occur in each split (bare expressions print
# nothing when this file runs as a script).
print(np.unique(dev_encoded_Y))
print(np.unique(train_encoded_Y))

# One-hot encode both splits to the same width. Without an explicit
# num_classes, to_categorical sizes each array from its own largest value, so
# a split that lacks the highest class would come out one column narrower.
num_classes = int(max(train_encoded_Y.max(), dev_encoded_Y.max())) + 1
train_encoded_Y = to_categorical(train_encoded_Y, num_classes=num_classes)
dev_encoded_Y = to_categorical(dev_encoded_Y, num_classes=num_classes)

print(test_embeddings.shape)

def baseline_model(num_classes=6, input_dim=768):
    """Build and compile the baseline feed-forward classifier.

    The network is a 100-unit ReLU hidden layer followed by a softmax output
    layer, compiled with categorical cross-entropy, the Adam optimizer and an
    accuracy metric.

    Args:
        num_classes: Number of units in the softmax output layer. Defaults to
            6, the value previously hard-coded here.
        input_dim: Width of each input vector. Defaults to 768, the value
            previously hard-coded here; it must match the second dimension of
            the embedding arrays fed to the model.

    Returns:
        The compiled ``Sequential`` model.
    """
    # Named `net` so it does not shadow the module-level `model`.
    net = Sequential()
    net.add(Dense(100, input_dim=input_dim, activation='relu'))
    net.add(Dense(num_classes, activation='softmax'))
    net.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return net

import numpy as np
import pandas as pd

# `callbacks` has to exist before it is handed to KerasClassifier; nothing
# earlier in the file defines it, so the call below would raise NameError.
# EarlyStopping (imported with the other Keras names above) stops training
# once the validation loss has not improved for `patience` epochs.
# NOTE(review): monitor/patience values are a reviewer choice -- adjust as needed.
callbacks = [EarlyStopping(monitor='val_loss', patience=3)]

# scikit-learn wrapper around baseline_model: up to 20 epochs, batch size 5,
# with the dev split passed as validation data during fit.
estimator = KerasClassifier(build_fn=baseline_model, epochs=20, batch_size=5, verbose=1,validation_data=(dev_embeddings,dev_encoded_Y),callbacks=callbacks)

# Fit on the training split, then predict the test embeddings.
estimator.fit(train_embeddings, train_encoded_Y)
predictions = estimator.predict(test_embeddings)
print(predictions)

# Map the predicted integers back to label values with the fitted encoder.
labelpredictions = encoder.inverse_transform(predictions)
print(labelpredictions)

# to_csv returns None when given a path, so its result is not bound to a name.
pd.DataFrame(predictions, columns=['predictions']).to_csv('/content/tprediction.csv')
pd.DataFrame(labelpredictions, columns=['labelpredictions']).to_csv('/content/tpredictionlabel.csv')

"""**Using** **MBERT**"""

from transformers import BertTokenizer, TFBertModel

# Run one sample sentence through multilingual BERT. Tokenizer and model are
# both loaded from the same checkpoint name. Note that this rebinds the
# module-level `model`, replacing the SentenceTransformer used above.
MBERT_CHECKPOINT = 'bert-base-multilingual-cased'
tokenizer = BertTokenizer.from_pretrained(MBERT_CHECKPOINT)
model = TFBertModel.from_pretrained(MBERT_CHECKPOINT)

text1 = "Replace me by any text you'd like."
encoded_input = tokenizer(text1, return_tensors='tf')
output = model(encoded_input)
