#------------------------------------------------#
def get_meta(filename):
	"""
	Extract the register, country and feature fields from a vector filename.

	The filename is split on "." and read positionally as
	register.language.country.samples.type.feature; anything after the
	sixth field (for example a file extension) is ignored.

	Args:
		filename: Name of a vector file with at least six dot-separated
			fields.

	Returns:
		A (register, country, feature) tuple of strings.

	Raises:
		IndexError: If the filename has fewer than six dot-separated fields.
	"""
	meta = filename.split(".")

	#Positions 1 (language), 3 (samples) and 4 (type) are not needed by
	#callers, so they are not bound to names (this also avoids shadowing
	#the builtin "type").
	register = meta[0]
	country = meta[2]
	feature = meta[5]

	return register, country, feature
	
#------------------------------------------------#
def get_vectors(language, type, register, feature):
	"""
	Load and stack the pickled feature matrices for one language and split.

	Every file in ./vectors/<language>/<type> is inspected through
	get_meta(). Files whose feature field equals `feature` and whose
	register field equals `register` (any register is accepted when
	`register` is "all") are unpickled and stacked row-wise.

	Args:
		language: Name of the language sub-directory under ./vectors.
		type: Name of the sub-directory under ./vectors/<language> to read
			(the script passes "dev", "train" and "test"). The parameter
			name shadows the builtin but is kept so existing callers work.
		register: Register to select, or "all" to accept every register.
		feature: Feature name to select.

	Returns:
		A tuple (x, y_list). x is the result of scipy.sparse.vstack over
		the loaded matrices; y_list holds one country label per row of x,
		in the same order.
		Assumes each pickled object is a 2-D sparse matrix and that all
		selected matrices have the same number of columns - TODO confirm.

	Raises:
		ValueError: If no file in the directory matches the selection.
	"""
	#Imported here because the file only imports these names under its
	#__main__ guard; without this the function raises NameError when the
	#module is imported rather than run as a script.
	import os
	import pickle
	from scipy import sparse

	x_list = []
	y_list = []

	current_path = os.path.join(".", "vectors", language, type)
	for filename in os.listdir(current_path):

		current_register, current_country, current_feature = get_meta(filename)

		if current_feature != feature:
			continue
		if register != "all" and current_register != register:
			continue

		#NOTE: pickle.load can execute arbitrary code; only point this at
		#vector files from a trusted source.
		with open(os.path.join(current_path, filename), "rb") as fo:
			x = pickle.load(fo)

		x_list.append(x)

		#One label per row of the matrix just loaded
		y_list.extend([current_country] * x.shape[0])

	if not x_list:
		raise ValueError(
			"No vector files in {!r} match register {!r} and feature {!r}".format(
				current_path, register, feature
			)
		)

	#Now stack
	x = sparse.vstack(x_list)

	return x, y_list
#------------------------------------------------#

if __name__ == "__main__":

	#Script entry point: for every feature/register combination that has
	#no saved model yet, choose LinearSVC parameters, train on the train
	#split, pickle the model and print an evaluation on the test split.

	import os
	import pickle
	import numpy as np
	from scipy import sparse
	from sklearn.svm import LinearSVC
	from sklearn.model_selection import GridSearchCV
	from sklearn.metrics import classification_report
	from sklearn.metrics import confusion_matrix

	language = "eng"
	features = ["unigrams", "bigrams", "trigrams", "function", "cxg1", "cxg2"]
	registers = ["cc", "twitter", "all"]

	model_dir = os.path.join(".", "models", language)

	for feature in features:
		for register in registers:

			model_name = "model." + language + "." + register + "." + feature + ".p"
			check_name = os.path.join(model_dir, model_name)

			#A model that already exists on disk is not retrained
			if os.path.isfile(check_name):
				continue

			print("Starting " + feature + " and " + register)

			#Get dev data
			dev_x, dev_y = get_vectors(language, "dev", register, feature)
			print(dev_x.shape, len(dev_y))

			if feature != "function":

				#GridSearch over C and the loss function on the dev split
				#NOTE(review): the C grid goes from 0.01 straight to 1.0
				#(no 0.1) - confirm this is intentional.
				parameters = {
					"C": [0.00001, 0.0001, 0.001, 0.01, 1.0, 10.0, 100],
					"loss": ["hinge", "squared_hinge"],
				}

				svc = LinearSVC(max_iter=1000000)
				cls = GridSearchCV(svc, parameters, cv=5, n_jobs=4)
				cls.fit(dev_x, dev_y)

				parameters = cls.best_params_
				print(parameters)

			else:
				#The "function" feature uses fixed parameters
				print("No grid search")
				parameters = {"C": 0.001, "loss": "squared_hinge"}

			#Get train/test sets
			train_x, train_y = get_vectors(language, "train", register, feature)
			test_x, test_y = get_vectors(language, "test", register, feature)

			#Initialize classifier with the selected parameters
			svc = LinearSVC(
				max_iter=1000000,
				C=parameters["C"],
				loss=parameters["loss"],
			)

			svc.fit(train_x, train_y)

			#Create the output directory if needed so a finished training
			#run is not lost to a missing folder, then save the model to
			#the same path that was checked above.
			os.makedirs(model_dir, exist_ok=True)
			with open(check_name, "wb") as fp:
				pickle.dump(svc, fp)

			test_predictions = svc.predict(test_x)

			current_report = classification_report(test_y, test_predictions, labels=None)
			current_matrix = confusion_matrix(test_y, test_predictions, labels=None)

			print(language, register, feature)
			print(current_report)
			print(current_matrix)