import pandas as pd
import numpy as np
from ekphrasis.classes.spellcorrect import SpellCorrector
from ekphrasis.classes.segmenter import Segmenter

class DataProcessing:
    """Prepare tweets from a tab-separated file as padded id matrices.

    On construction the class loads a word-embedding table, an emoji-embedding
    table and the data file, then pre-processes every tweet.  ``getTestData``
    converts the stored tweets into zero-padded numpy arrays of vocabulary ids.
    """

    # Fixed number of rows allocated for the word-embedding matrix; rows that
    # no file line fills stay all-zero.
    WORD_EMBEDDING_ROWS = 1195098
    # getVecFromEmoji stops after it has stored this many emoji vectors.
    EMOJI_ROWS = 1661

    # Clitics that prePro_1 detaches from the preceding word, each in a
    # straight-apostrophe and a curly-apostrophe variant.  The order is
    # significant (e.g. "'s" must be handled before "s'").
    _CLITICS = tuple(
        template.format(mark)
        for template in ("{}m", "{}re", "{}ll", "{}ve", "{}d", "n{}t", "{}s", "s{}")
        for mark in ("'", "’")
    )

    # Characters that prePro_1 surrounds with spaces so they become tokens.
    _PADDED_PUNCTUATION = '.,!！?？#$"”“;:()[]{}+-*/=—…•'

    # Characters that prePro_1 turns into a plain space.
    _BLANKED_CHARACTERS = (
        # Unicode space separators (no-break space, en/em spaces, ...), so
        # that the later split(' ') sees them as token boundaries.
        "\u00a0\u1680"
        + "".join(chr(code) for code in range(0x2000, 0x200B))
        + "\u202f\u205f\u3000"
        # All five emoji skin-tone modifiers, U+1F3FB .. U+1F3FF.
        + "\U0001F3FB\U0001F3FC\U0001F3FD\U0001F3FE\U0001F3FF"
        # Male / female signs used in gendered emoji sequences.
        + "\u2642\u2640"
    )

    # Single-pass translation table combining the two groups above.
    _TRANSLATION = {ord(ch): " " + ch + " " for ch in _PADDED_PUNCTUATION}
    _TRANSLATION.update({ord(ch): " " for ch in _BLANKED_CHARACTERS})

    def __init__(self, modelFile, dataFile, emojiFile, numDimensions, eab=0):
        """Load the embeddings and the data file.

        Args:
            modelFile: Path of the word-embedding text file; each line is a
                token followed by its space-separated vector components.
            dataFile: Path of a tab-separated file with a header row and the
                columns ``tweet`` and ``id``.
            emojiFile: Path of the emoji-embedding file (same line format as
                ``modelFile``).
            numDimensions: Length of every embedding vector.
            eab: When non-zero, ``emoji_vocab_size`` and ``emoji_embedding``
                are set in addition to ``emoji_vocab``.
        """
        print('Loading Embeddings...')
        self.numDimensions = numDimensions
        self.emoji_numDimensions = numDimensions
        self.vocab, embd = self.getVecFromLocal(modelFile, emojiFile)
        self.vocab_size = len(self.vocab)
        self.embedding = np.asarray(embd)
        self.eab = eab
        # The emoji vocabulary is always needed (prePro_2_attEmoji uses it);
        # the emoji embedding matrix is only kept when eab is enabled.
        self.emoji_vocab, emoji_embd = self.getVecFromEmoji(emojiFile)
        if eab != 0:
            self.emoji_vocab_size = len(self.emoji_vocab)
            self.emoji_embedding = np.asarray(emoji_embd)
        print('Embeddings loaded.')

        print('Loading Test Data...')
        self.seg = Segmenter(corpus="twitter")
        self.spc = SpellCorrector(corpus="twitter")
        self.test_id, self.test_sent, self.test_sent_emoji = self.getSentences_eab(dataFile)
        self.sent_leng = len(self.test_sent)
        print('Test Data loaded.')

    @staticmethod
    def _parseEmbeddingLine(raw):
        """Split one raw embedding-file line into ``(token, vector_fields)``.

        Trailing spaces and line terminators are removed first.  Returns
        ``None`` for lines that carry no vector (e.g. blank lines) so that
        callers can skip them instead of failing on the array assignment.
        """
        fields = raw.decode(encoding="utf-8").rstrip(" \r\n").split(' ')
        if len(fields) < 2:
            return None
        return fields[0], fields[1:]

    def getVecFromLocal(self, modelFile, emojiFile):
        """Build the word vocabulary and its embedding matrix.

        Ids 0-3 are reserved for ``__nok__``, ``__unk__``, ``__num__`` and
        ``__date__``; row 0 stays zero.  Tokens of ``modelFile`` follow from
        id 4 on, then every token of ``emojiFile`` that is not yet known.

        Returns:
            ``(vocab, embd)`` where ``vocab`` maps token -> row index and
            ``embd`` has shape ``(WORD_EMBEDDING_ROWS, numDimensions)``.
        """
        vocab = {'__nok__': 0, '__unk__': 1, '__num__': 2, '__date__': 3}
        embd = np.zeros((self.WORD_EMBEDDING_ROWS, self.numDimensions))
        # Rows 1-4 get uniform random values in (-1, 1].  Row 4 is normally
        # overwritten by the first loaded vector; it is still drawn so the
        # amount of randomness consumed stays the same as before.
        for row in range(1, 5):
            embd[row] = 1 - 2 * np.random.random(self.numDimensions)

        next_row = 4
        # Files are opened in binary mode and streamed line by line: only
        # b"\n" ends a line and the file is never held in memory as a whole.
        with open(modelFile, "rb") as handle:
            for raw in handle:
                parsed = self._parseEmbeddingLine(raw)
                if parsed is None:
                    continue
                token, vector = parsed
                vocab[token] = next_row
                embd[next_row] = vector
                next_row += 1
        with open(emojiFile, "rb") as handle:
            for raw in handle:
                parsed = self._parseEmbeddingLine(raw)
                if parsed is None:
                    continue
                token, vector = parsed
                if token in vocab:
                    # Already provided by the word model: keep that vector.
                    continue
                vocab[token] = next_row
                embd[next_row] = vector
                next_row += 1
        return vocab, embd

    def getVecFromEmoji(self, modelFile):
        """Build the emoji vocabulary and its embedding matrix.

        Id 0 is reserved for ``__nok__`` (zero vector).  At most
        ``EMOJI_ROWS`` vectors are read from ``modelFile``.

        Returns:
            ``(vocab, embd)`` where ``embd`` has shape
            ``(EMOJI_ROWS + 1, emoji_numDimensions)``.
        """
        vocab = {'__nok__': 0}
        capacity = self.EMOJI_ROWS + 1
        embd = np.zeros((capacity, self.emoji_numDimensions))
        next_row = 1
        with open(modelFile, "rb") as handle:
            for raw in handle:
                parsed = self._parseEmbeddingLine(raw)
                if parsed is None:
                    continue
                token, vector = parsed
                vocab[token] = next_row
                embd[next_row] = vector
                next_row += 1
                if next_row == capacity:
                    break
        return vocab, embd

    def getVecFromSent(self, sentence):
        """Convert a pre-processed sentence into a list of vocabulary ids.

        The sentence is split on single spaces; empty pieces are dropped and
        out-of-vocabulary tokens are passed to getVecFromSent_unkWordPro1.
        """
        ids = []
        for token in sentence.split(' '):
            if not token:
                continue
            known = self.vocab.get(token)
            if known is not None:
                ids.append(known)
            else:
                ids = self.getVecFromSent_unkWordPro1(token, ids)
        return ids

    def getVecFromSent_unkWordPro1(self, word, ids):
        """First fallback for an unknown token: word segmentation.

        Apostrophes are removed, the token is segmented with ``self.seg`` and
        each resulting piece is looked up; pieces that are still unknown go
        to getVecFromSent_unkWordPro2.  ``ids`` is extended in place and
        returned.
        """
        for mark in ("'", "‘", "’"):
            word = word.replace(mark, "")
        for piece in self.seg.segment(word).split(' '):
            if not piece:
                continue
            known = self.vocab.get(piece)
            if known is not None:
                ids.append(known)
            else:
                ids = self.getVecFromSent_unkWordPro2(piece, ids)
        return ids

    def getVecFromSent_unkWordPro2(self, word, ids):
        """Last fallback for an unknown token: spell correction.

        The corrected word is looked up in the vocabulary.  If it is still
        unknown, a decimal number in 1900..2020 appends id 4, any other
        decimal number appends id 3, and everything else appends id 1
        (``__unk__``).  ``ids`` is extended in place and returned.

        NOTE(review): ids 3 and 4 do not line up with the ``__num__`` (2) and
        ``__date__`` (3) rows registered in getVecFromLocal -- row 4 is the
        first loaded word.  They are left unchanged because a trained model
        may depend on them; confirm against the training pipeline.
        """
        corrected = self.spc.correct(word)
        known = self.vocab.get(corrected)
        if known is not None:
            ids.append(known)
        elif corrected.isdecimal():
            # isdecimal() (unlike isdigit()) only accepts characters int()
            # can parse, so tokens such as a superscript two no longer raise.
            ids.append(4 if 1900 <= int(corrected) <= 2020 else 3)
        else:
            ids.append(1)
        return ids

    def prePro_1(self, sentence):
        """Lower-case a tweet and insert spaces around tokens of interest.

        ``@user`` becomes `` <user> ``, clitics such as ``'m`` / ``n't`` are
        detached from the preceding word, punctuation is padded with spaces,
        and skin-tone modifiers, gender signs and Unicode space separators
        are replaced by a plain space.
        """
        sentence = sentence.lower().replace("@user", " <user> ")
        for clitic in self._CLITICS:
            sentence = sentence.replace(clitic, " " + clitic)
        # Every remaining rule rewrites a single character independently of
        # the others, so one translate() pass applies them all.
        return sentence.translate(self._TRANSLATION)

    def prePro_2_delEmoji(self, sentence):
        """Replace every emoji-vocabulary key found in ``sentence`` by a space."""
        for key in self.emoji_vocab:
            sentence = sentence.replace(key, " ")
        return sentence

    def prePro_2_attEmoji(self, sentence, getEmoji=True):
        """Space-pad every known emoji and collect the ids of those present.

        Returns:
            ``(sentence, emoji_ids)`` when ``getEmoji`` is true, otherwise
            only the rewritten sentence.  Each emoji id appears once, in
            emoji-vocabulary order.
        """
        emoji_encode_list = []
        for key, emoji_id in self.emoji_vocab.items():
            if key in sentence:
                emoji_encode_list.append(emoji_id)
                sentence = sentence.replace(key, " " + key + " ")
        if not getEmoji:
            return sentence
        return sentence, emoji_encode_list

    def getSentences_eab(self, file):
        """Read the data file and pre-process every tweet.

        Args:
            file: Tab-separated file with a header row and the columns
                ``tweet`` and ``id``.

        Returns:
            ``(ids, sentences, emoji_ids)``: three parallel lists holding the
            ids as strings, the pre-processed tweets and, per tweet, the list
            of emoji ids found in it.
        """
        test = pd.read_csv(file, sep='\t', header=0)
        test_id = []
        test_sent = []
        test_sent_emoji = []
        for raw_id, raw_tweet in zip(test['id'], test['tweet']):
            sentence, emoji_ids = self.prePro_2_attEmoji(self.prePro_1(str(raw_tweet)))
            test_id.append(str(raw_id))
            test_sent.append(sentence)
            test_sent_emoji.append(emoji_ids)
        return test_id, test_sent, test_sent_emoji

    def verPro(self, x):
        """Right-pad every sequence in ``x`` with zeros to a common length.

        The input sequences are copied, not modified, so lists stored on the
        instance are not altered by padding.

        Returns:
            A numpy array with one row per sequence.
        """
        width = max((len(sequence) for sequence in x), default=0)
        padded = [list(sequence) + [0] * (width - len(sequence)) for sequence in x]
        return np.array(padded)

    def getTestData(self):
        """Return ``(ids, inputs, emojis)`` for all loaded tweets.

        ``ids`` is the list of tweet ids, ``inputs`` the zero-padded matrix of
        word ids and ``emojis`` the zero-padded matrix of emoji ids.
        """
        count = self.sent_leng
        ids = list(self.test_id[:count])
        inputs = self.verPro(
            [self.getVecFromSent(sentence) for sentence in self.test_sent[:count]]
        )
        emojis = self.verPro(self.test_sent_emoji[:count])
        return ids, inputs, emojis

# wordModel = DataProcessing(
#         'D:/SemEval2019-Task6/workspace/wordModel/glove.twitter.27B.200d.txt',
#         'D:/SemEval2019-Task6/workspace/data/test/testset-taskb.tsv',
#         'D:/SemEval2019-Task6/workspace/wordModel/emoji2vec200.txt',
#         200, eab=2)
# ids, inputs, emojis = wordModel.getTestData()
# print(ids[0:5])
# print(inputs[0:5])
# print(emojis[0:5])
