import numpy as np
import re
import pandas as pd
from tqdm import tqdm
import nltk
from unidecode import unidecode

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, RobertaTokenizerFast, BertTokenizerFast, AutoModel, AdamW

# URL-like spans: an optional http/ftp/https scheme, a host-ish run of
# characters, a dot, a 2-6 letter lowercase suffix, then any trailing
# path/query characters.
_LINK_PATTERN = re.compile(
    r"((http|ftp|https):\/\/)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)"
)


def remove_links(x):
    """Return ``x`` with every URL-like substring deleted.

    The scheme is optional in the pattern, so bare ``name.suffix`` tokens
    (e.g. ``foo.org/path`` or ``hello.world``) are removed as well.
    Surrounding whitespace is left untouched.
    """
    return _LINK_PATTERN.sub("", x)

# Patterns used by preproc(), compiled once at import time rather than on
# every call. Raw strings avoid the invalid-escape warning that '\d' and '\.'
# trigger inside an ordinary string literal.
_TAB_RUN = re.compile(r'\t+')
_LIST_NUMBER = re.compile(r'\d+\. ')
_SENTENCE_END = re.compile('[?!.]')
_EMOJI = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)


def preproc(text):
    """Normalise a raw comment string.

    Steps, applied in this order:

    1. delete runs of tab characters;
    2. delete list numbering of the form ``"12. "`` (digits, dot, space);
    3. replace each ``?``, ``!`` or ``.`` with a space-padded ``" . "``;
    4. delete emoji in the four code-point ranges listed in ``_EMOJI``;
    5. drop every whitespace-separated token that starts with ``@``;
    6. join the surviving tokens with single spaces.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string; empty if no tokens survive.
    """
    text = _TAB_RUN.sub('', text)
    text = _LIST_NUMBER.sub('', text)
    text = _SENTENCE_END.sub(" . ", text)
    text = _EMOJI.sub('', text)
    # str.split() never yields empty tokens, so startswith() is safe here.
    # Re-joining with single spaces also collapses every whitespace run and
    # trims both ends, so no separate whitespace-collapsing pass is needed.
    return ' '.join(
        token for token in text.split() if not token.startswith('@')
    )

def tokenize_txt(txt):
    """Split ``txt`` into lowercase alphabetic tokens without English stop words.

    Args:
        txt: the string to tokenize.

    Returns:
        A list of lowercased tokens, in order of appearance, with every
        token found in NLTK's English stop-word list removed.

    Note:
        Needs the NLTK ``stopwords`` corpus to be installed locally
        (presumably via ``nltk.download('stopwords')`` -- confirm for the
        deployment environment).
    """
    # The visible part of the module only does `import nltk`, which leaves
    # `stopwords` and `RegexpTokenizer` undefined at module scope; importing
    # them here keeps the function from failing with NameError.
    from nltk.corpus import stopwords
    from nltk.tokenize import RegexpTokenizer

    stop_words = set(stopwords.words('english'))
    # Keep runs of ASCII letters only; digits and punctuation are discarded.
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+', gaps=False)
    # Lowercase each token once, then filter against the stop-word set.
    lowered = (token.lower() for token in tokenizer.tokenize(txt))
    return [token for token in lowered if token not in stop_words]

def lemmatization(data):
    """Lemmatize each word in ``data`` and join the results with spaces.

    Args:
        data: an iterable of word strings (e.g. the list returned by
            ``tokenize_txt``).

    Returns:
        A single string of the lemmatized words separated by one space.
    """
    # The visible part of the module only does `import nltk`, which leaves
    # `WordNetLemmatizer` undefined at module scope; importing it here keeps
    # the function from failing with NameError.
    from nltk.stem import WordNetLemmatizer

    lemmatizer = WordNetLemmatizer()
    return ' '.join(lemmatizer.lemmatize(word) for word in data)

def _prepare_hope_frame(raw):
    """Turn a raw two-column hope-speech table into cleaned text and 0/1 labels.

    The columns are renamed to ``text`` and ``tag``, rows tagged
    ``'not-English'`` are dropped, ``text`` is passed through
    ``remove_links`` and then ``preproc``, and ``tag`` becomes 1 for
    ``'Hope_speech'`` and 0 for anything else.

    Args:
        raw: DataFrame read from a header-less CSV with exactly two columns.

    Returns:
        The cleaned DataFrame with columns ``text`` and ``tag``.
    """
    # NOTE(review): the pd.DataFrame(...) wrap is kept from the original;
    # whether `raw` also observes the column rename depends on pandas' copy
    # semantics -- confirm before relying on `raw` afterwards.
    frame = pd.DataFrame(raw)
    frame.columns = ['text', 'tag']
    frame.drop(frame[frame['tag'] == 'not-English'].index, inplace=True)
    frame['text'] = frame['text'].apply(remove_links).apply(preproc)
    frame['tag'] = frame['tag'].apply(lambda tag: 1 if tag == 'Hope_speech' else 0)
    return frame


# Training data: clean it, then hold out 20% as a stratified test split.
hope_train = pd.read_csv("./hope_train.csv", header=None)
data = _prepare_hope_frame(hope_train)
print("Train preprocessed")

X = data['text']
y = data['tag']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, stratify=y, random_state=42
)

# Validation data goes through exactly the same cleaning and is used whole.
# `data` is rebound to the validation frame from here on.
hope_dev = pd.read_csv("./hope_dev.csv", header=None)
data = _prepare_hope_frame(hope_dev)
print("Val preprocessed")

X_val = data['text']
y_val = data['tag']

# Sentence-ending punctuation, compiled once instead of once per line.
ends = re.compile('[?!.]')


def _write_lines(path, texts):
    """Write ``texts`` to ``path``, one entry per line, no trailing newline.

    Each entry is transliterated with ``unidecode`` and then has every
    ``?``, ``!`` or ``.`` replaced by a space-padded ``" . "``.

    Args:
        path: destination file path; an existing file is overwritten.
        texts: iterable of strings to write.
    """
    # NOTE(review): X_train / X_val have already been through preproc(),
    # which pads the same punctuation. This second pass runs after unidecode,
    # presumably to catch punctuation introduced by transliteration. It also
    # re-pads existing "." tokens, producing doubled spaces -- confirm that
    # downstream consumers are whitespace-insensitive.
    #
    # `with` guarantees the handle is closed even if unidecode or the write
    # raises. unidecode output is expected to be ASCII, so the explicit
    # encoding only pins what used to be the platform default.
    with open(path, "w", encoding="utf-8") as out:
        out.write("\n".join(ends.sub(" . ", unidecode(text)) for text in texts))


_write_lines("train_data.txt", X_train)
_write_lines("val_data.txt", X_val)