import collections
import os
import re
import _pickle
import numpy as np
import pandas as pd
import time
import nltk
import csv
import stanfordcorenlp

#config = tf.ConfigProto()
#config.gpu_options.per_process_gpu_memory_fraction = 0


"""
df = {'text': texts, 'tokens': tokenized_texts, 'indexed': indexed_texts, 'len': lengths,
                  'label': labels})
"""

def get_maxlen(df):
    """Return the largest value stored in the ``'len'`` column of *df*.

    :param df: object supporting ``df['len'].tolist()`` (e.g. a DataFrame)
    :return: ``max`` over the column's values
    """
    return max(df['len'].tolist())


def load_glove(gloveFile,dic,min_cnt=5):
    """
    Build an embedding matrix from a GloVe text file for the frequent words of *dic*.

    Words whose count exceeds *min_cnt* and that appear in the GloVe file get
    their pretrained vector; frequent words missing from the file get a random
    N(0, 0.4) vector. All other rows stay zero.

    :param gloveFile: path to a space-separated GloVe file (word followed by floats)
    :param dic: {word: {'index': int, 'cnt': int}}
    :param min_cnt: only words with ``cnt > min_cnt`` receive a vector
    :return: array of shape (n_frequent_words + 2, 300)
    """
    n_words = set_oov_index(dic,min_cnt)
    words = {}
    # Context manager so the file handle is released even if parsing fails.
    with open(gloveFile,'rb') as f:
        for line in f:
            splitLine = line.decode().split(' ')
            word = splitLine[0]
            entry = dic.get(word)
            if entry is None or entry['cnt'] <= min_cnt:
                continue
            try:
                vector = [float(val) for val in splitLine[1:]]
            except ValueError:
                # Malformed line: report it and keep going.
                print(line)
                print(splitLine)
                continue
            if word in words:
                # Duplicate entry in the GloVe file; the later one wins.
                print(word)
            words[word] = vector
    vectors = np.zeros((n_words+2,300))
    for word, entry in dic.items():
        if entry['cnt'] <= min_cnt:
            continue
        if word in words:
            vectors[entry['index']] = words[word]
        else:
            # Frequent word without a pretrained vector: random initialisation.
            vectors[entry['index']] = np.random.normal(0,0.4,300)
    return vectors


def set_oov_index(dic,min_cnt):
    """Count the words in *dic* whose ``'cnt'`` is strictly greater than *min_cnt*.

    :param dic: {word: {'cnt': int, ...}}
    :param min_cnt: frequency threshold (exclusive)
    :return: number of words above the threshold
    """
    return sum(1 for word in dic if dic[word]['cnt'] > min_cnt)


def get_relative_path(path):
    """Walk *path* recursively and collect every file found.

    :param path: root directory passed to ``os.walk``
    :return: ``(filenames, filepaths)`` - bare file names and the matching
             ``os.path.join(root, name)`` paths, in walk order
    """
    filenames, filepaths = [], []
    for root, _dirs, files in os.walk(path):
        for name in files:
            filepaths.append(os.path.join(root, name))
            filenames.append(name)
    return filenames, filepaths


def save_file(filename,to_save):
    """Pickle *to_save* into *filename* using pickle protocol 4."""
    with open(filename, mode='wb') as sink:
        _pickle.dump(to_save, sink, protocol=4)


def save_hdf(df,filename,key):
    """Write *df* to the HDF5 file *filename* under *key*.

    ``key`` is passed by keyword because recent pandas versions deprecate
    passing it positionally to ``to_hdf``.
    """
    df.to_hdf(filename, key=key)


def load_hdf(filename,key):
    """Read and return the object stored under *key* in the HDF5 file *filename*."""
    loaded = pd.read_hdf(filename, key=key)
    return loaded


def load_file(filename):
    """Unpickle and return the object stored in *filename*.

    NOTE(review): unpickling executes arbitrary code; only use on trusted files.
    """
    with open(filename, mode='rb') as source:
        loaded = _pickle.load(source)
    return loaded


'''Revisit this later, after learning how to use spaCy
def tokenize(text,replace_NR=True):
    if 'nlp' not in dir():
        nlp = spacy.load('en')
    doc = nlp(text)
    if replace_NR:
        to_return = []
        temp = [word.ent_type_ if word.ent_type_ else  word.string.lower()  for word in doc]
        for idx, i in enumerate(temp):
            if idx == len(temp)-1:
                break
            if i.isupper() and temp[idx + 1] == i:
                continue
            to_return.append(i)
        return to_return, len(to_return)
    else:
        to_return = list(doc)
        return to_return, len(to_return)

def tokenize_hierarchy(text,replace_NR=True):
    if 'nlp' not in dir():
        nlp = spacy.load('en')
    doc = nlp(text)
    lengths = []
    if replace_NR:
        to_returns = []
        for sentence in doc.sents:
            to_return = []
            temp = [word.ent_type_ if word.ent_type_ else  word.string.lower() for word in sentence]
            for idx, i in enumerate(temp):
                if idx == len(temp)-1:
                    break
                if i.isupper() and temp[idx + 1] == i:
                    continue
                to_return.append(i)
            to_returns.append(to_return)
            lengths.append(len(to_return))
        return to_returns,lengths
    else:
        to_returns = []
        for sentence in doc.sents:
            to_return = list(sentence)
            to_returns.append(to_return)
            lengths.append(len(to_return))
        return to_returns,lengths
'''


def tokenize(text,tokenizer):
    """Tokenize *text* into a flat list of tokens.

    :param text: string to tokenize
    :param tokenizer: object exposing ``word_tokenize(text)``
    :return: whatever ``tokenizer.word_tokenize`` returns, or ``None`` if it raised
    """
    try:
        return tokenizer.word_tokenize(text)
    except Exception:
        # Best effort: a failing document is skipped by the caller. Only
        # ordinary errors are swallowed, so Ctrl-C / SystemExit still propagate.
        print('exception')
        return None


def tokenize_hierarchy(text,tokenizer):
    """Split *text* into sentences with NLTK, then tokenize each sentence.

    :param text: string to tokenize
    :param tokenizer: object exposing ``word_tokenize(sentence)``
    :return: list with one token list per sentence, or ``None`` if sentence
             splitting or tokenization raised
    """
    tokens = []
    try:
        for sent in nltk.sent_tokenize(text):
            tokens.append(tokenizer.word_tokenize(sent))
    except Exception:
        # Best effort: a failing document is skipped by the caller. Only
        # ordinary errors are swallowed, so Ctrl-C / SystemExit still propagate.
        print('exception')
        return None
    return tokens


def get_lengths(tokenized_text):
    """Return the length structure of a (possibly nested) token list.

    When ``count_nests`` reports a nesting depth of 1 the result is
    ``len(tokenized_text)``; otherwise it is a list with the lengths of each
    child, computed recursively.
    """
    if count_nests(tokenized_text) != 1:
        return [get_lengths(child) for child in tokenized_text]
    return len(tokenized_text)


def get_char_lengths(tokenized_text):
    """Return ``len(token)`` for every token in *tokenized_text*, in order."""
    return [len(token) for token in tokenized_text]


def read_data_imdb(directory):
    """Read the review files below *directory* and tokenize them per sentence.

    Only files whose immediate parent folder is ``neg`` or ``pos`` are used;
    the first line of each file is lower-cased, stripped, preprocessed and
    tokenized. Documents whose tokenization fails or is empty are skipped.

    :param directory: root folder to walk
    :return: ``(texts, tokenized_texts, lengths, labels)`` where the label is
             0 for ``neg`` and 1 for ``pos``
    """
    texts = []
    tokenized_texts = []
    labels = []
    lengths = []
    filenames = []
    # Distinct loop names so the ``directory`` argument is not shadowed.
    for path, _dirs, files in os.walk(directory):
        for file in files:
            filenames.append(os.path.join(path,file))
    tokenizer = stanfordcorenlp.StanfordCoreNLP(r'/home/choi/hdd/stanford-corenlp-full-2017-06-09')
    try:
        for filename in filenames:
            last_folder = get_last_folder_name(filename)
            if last_folder not in ('neg','pos'):
                continue
            # Context manager: the original leaked one file handle per review.
            with open(filename,'rb') as f:
                text = f.readline().decode()
            text = text.lower().strip()
            tokenized_text = tokenize_hierarchy(preprocess(text),tokenizer)
            if not tokenized_text:
                continue
            texts.append(text)
            tokenized_texts.append(tokenized_text)
            lengths.append(get_lengths(tokenized_text))
            # Label from the parent folder, not from a substring of the whole
            # path (a path such as ".../negatives/pos/x.txt" was mislabelled).
            labels.append(0 if last_folder == 'neg' else 1)
    finally:
        # Shut down the CoreNLP backend process started by the wrapper.
        tokenizer.close()

    return texts,tokenized_texts,lengths,labels

def get_last_folder_name(filename):
    """Return the name of the folder directly containing *filename*.

    Works on ``'/'``-separated paths only: everything from the last ``'/'``
    onward is dropped, then the part after the remaining last ``'/'`` is returned.
    """
    parent_end = filename.rfind('/')
    parent = filename[:parent_end]
    name_start = parent.rfind('/') + 1
    return parent[name_start:]

def build_imdb_df(folder,min_cnt=5):
    """Build train and test DataFrames from ``folder/train`` and ``folder/test``.

    The vocabulary is built from the training split only and reused to index
    the test split.

    :param folder: dataset root containing ``train`` and ``test`` sub-folders
    :param min_cnt: frequency threshold forwarded to ``index_texts_hierarchy``
    :return: ``(train_df, test_df, real_count, dictionary)``
    """
    train_folder = folder+'/train'
    test_folder = folder+'/test'

    _tr_texts, tr_tokenized, tr_lengths, tr_labels = read_data_imdb(train_folder)
    _te_texts, te_tokenized, te_lengths, te_labels = read_data_imdb(test_folder)
    tr_indexed, real_count, dictionary, _inv_wordidx = index_texts_hierarchy(tr_tokenized, None, min_cnt)
    te_indexed = index_texts_hierarchy(te_tokenized,dictionary,min_cnt)
    # build_df takes (indexed_texts, lengths, labels); the original passed five
    # arguments and raised TypeError.
    tr_df = build_df(tr_indexed, tr_lengths, tr_labels)
    te_df = build_df(te_indexed, te_lengths, te_labels)
    return tr_df, te_df, real_count, dictionary



def read_from_csv(filename,replace_NR=False,hierarchy=False,remove_article=True):
    """
    Read a header-less CSV of labelled texts and tokenize every row.

    Column 0 holds the 1-based integer label and column 1 the text. Documents
    whose tokenization fails, or that are empty after filtering, are dropped
    together with their label.

    :param filename: path of the CSV file
    :param replace_NR: unused; kept for interface compatibility
    :param hierarchy: if True tokenize into sentences of tokens, otherwise
                      into a flat token list
    :param remove_article: if True drop the tokens 'a', 'an' and 'the'
    :return: ``(tokenized_texts, lengths, labels)`` with 0-based labels
    """
    print('read from csv starts:', filename)
    # Named ``frame`` so the ``csv`` module is not shadowed inside the function.
    frame = pd.read_csv(filename, header=None,encoding='utf-8')
    # ``np.int`` was removed from NumPy; the builtin ``int`` is the same dtype.
    labels = list(np.array(frame[0].tolist(),dtype=int) - 1)
    print('lengths of file :',len(frame))
    texts = [text.strip().lower() for text in frame[1].tolist()]
    tokenizer = stanfordcorenlp.StanfordCoreNLP(r'E:\stanford-corenlp-full-2017-06-09')
    print('tokenizer loaded')

    tokenized_texts = []
    lengths = []
    refined_labels = []
    maxlen = 0
    maxsentlen = 0
    sentence_sum = 0
    word_sum = 0
    cnt = 0
    try:
        for idx, text in enumerate(texts):
            if hierarchy:
                tokenized_text = tokenize_hierarchy(preprocess(text), tokenizer)
            else:
                tokenized_text = tokenize(preprocess(text), tokenizer)
            if not tokenized_text:
                continue
            if remove_article:
                tokenized_text = remove_articles(tokenized_text)
            tokenized_text = remove_empty_list(tokenized_text)
            if not tokenized_text:
                continue
            if hierarchy:
                cnt += 1
                length = get_lengths(tokenized_text)
                sentence_sum += len(length)
                word_sum += sum(length)
                maxsentlen = max(maxsentlen, len(length))
                maxlen = max(maxlen, sum(length))
            else:
                tokenized_text = [replace_bracket(word) for word in tokenized_text]
                length = get_lengths(tokenized_text)
                maxlen = max(maxlen, length)
            tokenized_texts.append(tokenized_text)
            lengths.append(length)
            refined_labels.append(labels[idx])
    finally:
        # Shut down the CoreNLP backend process started by the wrapper.
        tokenizer.close()

    if hierarchy:
        # Guard the averages: the original divided by zero when every
        # document was dropped.
        if cnt:
            print('avg s len',sentence_sum / cnt)
            print('avg w len',word_sum / cnt)
        print('max len :', maxlen)
        print('max sent len :', maxsentlen)
    print('read from csv finished :',filename)

    return tokenized_texts,lengths,refined_labels


def remove_articles(tokenized_text):
    """Drop the tokens 'a', 'an' and 'the' from a flat or two-level token list.

    When ``count_nests`` reports depth 2 the filter is applied to every
    sentence separately; otherwise it is applied to the list itself.
    """
    articles = ['a', 'an', 'the']
    if count_nests(tokenized_text) != 2:
        return [token for token in tokenized_text if token not in articles]
    return [[token for token in sent if token not in articles]
            for sent in tokenized_text]


def remove_empty_list(lists):
    """Return a copy of *lists* without any element equal to ``[]``.

    The original returned the result of ``list.remove`` (always ``None``)
    and removed only the first empty list.

    :param lists: list of tokens or of token lists
    :return: new list with every ``[]`` element removed
    """
    return [item for item in lists if item != []]


def preprocess(text):   # strips assorted noise from raw text
    """Clean *text* with a fixed sequence of regex substitutions.

    In order: literal backslash-n sequences become a space, HTML-like tags
    are removed, and runs of two or more '.', '!', '-', '?' or '_' are
    collapsed to exactly two characters.
    """
    substitutions = (
        (r'\\n', ' '),
        ("<(/)?([a-zA-Z]*)(\\s[a-zA-Z]*=[^>]*)?(\\s)*(/)?>", ""),
        ('[.]{2,}', '..'),
        ('[!]{2,}', '!!'),
        ('[-]{2,}', '--'),
        ('[?]{2,}', '??'),
        ('[_]{2,}', '__'),
    )
    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned


def word_chars_index(total_wordidx, char_dict):
    """
    Map every word index to the list of character indices of its word.

    '-LRB-' and '-RRB-' are replaced by '(' and ')' before the lookup;
    characters missing from *char_dict* get the index ``len(char_dict)``.

    :param total_wordidx: {word: word index}
    :param char_dict: {character: character index}
    :return: {word index: [character indices]}
    """
    wordidx_to_charidx = {}
    for word, word_idx in total_wordidx.items():
        normalized = word.replace('-LRB-', '(').replace('-RRB-', ')')
        wordidx_to_charidx[word_idx] = [
            char_dict[ch] if ch in char_dict else len(char_dict)
            for ch in normalized
        ]
    return wordidx_to_charidx


def replace_bracket(word):
    """Replace the tokens '-LRB-' and '-RRB-' inside *word* with '(' and ')'."""
    return word.replace('-LRB-', '(').replace('-RRB-', ')')

def index_texts(tokenized_texts, word_dictionary=None):
    """Convert flat token lists to index lists, building a vocabulary if needed.

    When *word_dictionary* is ``None`` a new one is built from
    *tokenized_texts*, with indices assigned in ``Counter.most_common`` order.
    A supplied dictionary is extended in place with unseen words (count 0).

    :param tokenized_texts: list of token lists
    :param word_dictionary: {word: {'index': int, 'cnt': int}} or ``None``
    :return: ``(indexed_texts, word_dictionary)``
    """
    if word_dictionary is None:
        counter = collections.Counter()
        for text in tokenized_texts:
            counter.update(text)
        word_dictionary = {}
        for word, cnt in counter.most_common():
            word_dictionary[word] = {'index': len(word_dictionary), 'cnt': cnt}
    indexed_texts, word_dictionary = __index(tokenized_texts,word_dictionary)
    return indexed_texts, word_dictionary


def __index(tokenized_texts,dictionary):
    """Replace every token by its index, registering unseen tokens on the fly.

    Unknown words are added to *dictionary* (mutated in place) with the next
    free index and a count of 0.

    :param tokenized_texts: list of token lists
    :param dictionary: {word: {'index': int, 'cnt': int}}
    :return: ``(indexed_texts, dictionary)``
    """
    indexed_texts = []
    for tokens in tokenized_texts:
        ids = []
        for token in tokens:
            if token not in dictionary:
                next_index = len(dictionary)
                dictionary[token] = {'index': next_index, 'cnt': 0}
            ids.append(dictionary[token]['index'])
        indexed_texts.append(ids)
    return indexed_texts, dictionary


def index_texts_hierarchy(tokenized_texts,dictionary=None,min_cnt=5):
    """Index documents given as lists of sentences of tokens.

    With a *dictionary* the texts are only indexed. Without one, a vocabulary
    is built first: words occurring more than *min_cnt* times keep their own
    index (most frequent first) and all rarer words are pooled into a final
    'UNK' entry.

    NOTE(review): if the corpus itself contains a frequent token 'UNK', the
    'UNK' entry is written twice and its index no longer equals
    ``len(dictionary) - 1`` - confirm whether that can occur.

    :param tokenized_texts: list of documents, each a list of token lists
    :param dictionary: {word: index} or ``None`` to build one
    :param min_cnt: words must occur more than this many times to be kept
    :return: ``indexed_texts`` when *dictionary* was given, otherwise
             ``(indexed_texts, real_count, dictionary, inv_wordidx)``
    """
    if dictionary is not None:
        return __index_hierarchy(tokenized_texts,dictionary)

    # Build the word dictionary.
    counter = collections.Counter()
    for text in tokenized_texts:
        for sentence in text:
            counter.update(sentence)

    real_count = []
    unk_cnt = 0
    # ``freq`` instead of ``count``: the original reused the name of the list
    # being iterated as the loop variable.
    for word, freq in counter.most_common():
        if freq > min_cnt:
            real_count.append([word, freq])
        else:
            unk_cnt += freq
    real_count.append(['UNK', unk_cnt])

    dictionary = {}
    for word, _ in real_count:
        dictionary[word] = len(dictionary)
    inv_wordidx = dict(zip(dictionary.values(), dictionary.keys()))
    indexed_texts = __index_hierarchy(tokenized_texts,dictionary)
    return indexed_texts, real_count, dictionary, inv_wordidx


def __index_hierarchy(tokenized_texts,dictionary):
    """Index documents of sentences of tokens with a fixed *dictionary*.

    Words missing from *dictionary* are mapped to ``len(dictionary) - 1``.

    :param tokenized_texts: list of documents, each a list of token lists
    :param dictionary: {word: index}
    :return: same nesting as the input with indices in place of tokens
    """
    oov_index = len(dictionary) - 1
    indexed_texts = []
    for text in tokenized_texts:
        indexed_texts.append(
            [[dictionary.get(word, oov_index) for word in sentence]
             for sentence in text]
        )
    return indexed_texts


def build_datasets_csv(filename, dictionary=None, hierarchy=False, replace_NR=False, min_cnt = 5, remove_article=False):
    """Read a labelled CSV, index its texts and wrap the result in a DataFrame.

    :param filename: CSV path forwarded to ``read_from_csv``
    :param dictionary: existing vocabulary, or ``None`` to build one
    :param hierarchy: index sentence-level structure instead of flat tokens
    :param replace_NR: forwarded to ``read_from_csv``
    :param min_cnt: frequency threshold for the hierarchical vocabulary
    :param remove_article: forwarded to ``read_from_csv``
    :return: ``(dataframe, word dictionary)``
    """
    tokenized_texts, text_lengths, labels = read_from_csv(filename,replace_NR,hierarchy,remove_article=remove_article)
    print('indexing starts')
    if not hierarchy:
        indexed_texts, wordidx = index_texts(tokenized_texts,dictionary)
    elif dictionary is None:
        indexed_texts, _real_count, wordidx, _inv_wordidx = index_texts_hierarchy(tokenized_texts,None,min_cnt)
    else:
        indexed_texts = index_texts_hierarchy(tokenized_texts,dictionary,min_cnt)
        # The supplied dictionary is the vocabulary in use; the original left
        # ``wordidx`` unbound here and raised UnboundLocalError on return.
        wordidx = dictionary

    return build_df(indexed_texts,text_lengths,labels), wordidx


def build_df(indexed_texts,lengths,labels):
    """Assemble a DataFrame with the columns 'indexed', 'len' and 'label'."""
    columns = {'indexed': indexed_texts, 'len': lengths, 'label': labels}
    return pd.DataFrame(columns)


def list_unroll(lists):  # flattens lists nested inside a list
    """Flatten a nested list into a single flat list.

    Nesting is detected from the first element only: if it is a list, every
    element is flattened recursively, otherwise the list is taken as flat.
    An empty list yields ``[]`` (the original raised IndexError).
    """
    if not lists:
        return []
    if not isinstance(lists[0],list):
        return list(lists)
    unrolled = []
    for item in lists:
        unrolled.extend(list_unroll(item))
    return unrolled


def list_unroll_but_last(lists):  # flattens nested lists but keeps the innermost lists
    """Flatten every nesting level except the innermost one.

    Nesting is detected from the first element only; a list whose first
    element is not a list is returned wrapped in a one-element list.
    """
    if not isinstance(lists[0],list):
        return [lists]
    flattened = []
    for child in lists:
        flattened += list_unroll_but_last(child)
    return flattened


def count_nests(lists):  # counts how deeply the list is nested
    """Return the nesting depth of *lists*, following the first element only.

    A non-list has depth 0 and a flat list depth 1. An empty list also counts
    as depth 1 (the original raised IndexError on it).
    """
    if not isinstance(lists,list):
        return 0
    if lists and isinstance(lists[0],list):
        return 1 + count_nests(lists[0])
    return 1


def is_empty(text):
    """Return True when ``len(text)`` is zero."""
    return not len(text)


def batch_shape(lists):  # computes the maximum size along every nesting level
    """Return the maximum length found at each nesting level of *lists*.

    Levels are processed outside-in; descent stops as soon as the first
    element of the current level is not a list.
    """
    level = [lists]
    maxshape = []
    while isinstance(level[0],list):
        maxshape.append(max(len(item) for item in level))
        # Concatenate the children of every item to form the next level.
        level = [child for item in level for child in item]
    return maxshape


def sub_array(index,shape,padding_idx):         # converts a nested list into a numpy array
    """Copy the nested list *index* into an array of *shape*, padded with *padding_idx*.

    The array starts out filled with *padding_idx*; for a one-dimensional
    shape the values are written to the front, otherwise each sub-list is
    converted recursively with the remaining shape.
    """
    array = padding_idx*np.ones(shape,dtype=np.int32)
    if len(shape) == 1:
        array[:len(index)] = index
        return array
    inner_shape = shape[1:]
    for row, item in enumerate(index):
        array[row] = sub_array(item,inner_shape,padding_idx)
    return array


def padded_sub_array(index,shape,padding_idx):         # converts a nested list into a numpy array
    """Copy the nested list *index* into an array of *shape*, padded with *padding_idx*.

    Cells not covered by *index* keep the value *padding_idx*.
    """
    padded = padding_idx*np.ones(shape,dtype=np.int32)
    if len(shape) != 1:
        rest = shape[1:]
        for pos in range(len(index)):
            padded[pos] = padded_sub_array(index[pos],rest,padding_idx)
    else:
        padded[:len(index)] = index
    return padded


def hierarchy_lengths(lengths_list, padding_idx=0):
    """
    Compute padded length arrays for every level of a nested list.

    The first entry is *lengths_list* itself converted to a padded array;
    each following entry holds the lengths of the innermost lists of the
    previous level, one level shallower each time.

    :param lengths_list: nested list
    :param padding_idx: fill value for padded cells. ``sub_array`` requires
        it; the original call omitted it and raised TypeError.
    :return: list of numpy arrays, deepest level first
    """
    shape = batch_shape(lengths_list)
    lengths = [sub_array(lengths_list,shape,padding_idx)]
    current = lengths_list
    remaining_shape = shape[:-1]
    for _ in range(len(shape) - 1):
        current = get_lengths_bottom(current)
        lengths.append(sub_array(current,remaining_shape,padding_idx))
        remaining_shape = remaining_shape[:-1]
    return lengths


def get_lengths_bottom(nested_list):   # computes the lengths of the innermost lists and lifts them one level up
    """Replace every innermost list of *nested_list* by its length.

    The depth is probed through ``nested_list[0][0]``: while that is a list
    the function recurses into every element.
    """
    if not isinstance(nested_list[0][0],list):
        return [len(item) for item in nested_list]
    return [get_lengths_bottom(item) for item in nested_list]


def embedding_matrix(embedding_dic,dictionary):
    """
    Stack the vectors of *embedding_dic* into a matrix ordered by *dictionary*.

    The embedding width is taken from the first value of *embedding_dic*
    (0 when it is empty). Every word of *dictionary* must be present in
    *embedding_dic*.

    :param embedding_dic: {word: embedding (list)}
    :param dictionary: {word: id}
    :return: array of shape (len(dictionary), embedding width)
    """
    first_vector = next(iter(embedding_dic.values()), ())
    width = len(first_vector)
    matrix = np.zeros([len(dictionary), width])
    for word, row in dictionary.items():
        matrix[row] = embedding_dic[word]
    return matrix


def save_embedding(embedding,dictionary,filename):
    """
    Pickle an embedding as ``{word: vector as list}``.

    Row ``i`` of *embedding* is stored under the word whose index in
    *dictionary* is ``i``.
    """
    idx_to_word = {idx: word for word, idx in dictionary.items()}
    res = {idx_to_word[idx]: vec.tolist() for idx, vec in enumerate(embedding)}
    save_file(filename,res)


def build_test_train_csv(train_filename,test_filename,segment_level=10,replace_NR=False,hierarchy=False,remove_article=False):   # loads the train and test sets and saves them, along with the dictionary
    """Build and pickle the train set, the test set and the word dictionary.

    Writes 'train.pkl', 'test.pkl' and 'dictionary.pkl' in the working
    directory. The dictionary returned for the train set is passed on when
    the test set is built. *segment_level* is not used.
    """
    shared = dict(replace_NR=replace_NR, hierarchy=hierarchy, remove_article=remove_article)

    train_df, wordidx = build_datasets_csv(train_filename, dictionary=None, **shared)
    save_file('train.pkl', train_df)
    del train_df  # release the train frame before building the test frame

    test_df, wordidx = build_datasets_csv(test_filename, dictionary=wordidx, **shared)
    save_file('test.pkl', test_df)

    save_file('dictionary.pkl', wordidx)


def unroll_csv(filename):
    """Collapse every text column of a CSV into a single column.

    Each output row is ``[first column, ' ' + second + ' ' + third ...]``, so
    the text keeps a leading space, exactly as before. Blank rows are skipped
    (the original raised IndexError on them). The result is written next to
    the input as ``<name>_concated<ext>``.

    :param filename: path of the UTF-8 encoded CSV file to convert
    """
    csv.field_size_limit(100000000)
    # splitext instead of slicing off four characters, so extensions of any
    # length work; for '.csv' the result is unchanged.
    stem, ext = os.path.splitext(filename)
    file_to_write = stem+'_concated'+ext
    # newline='' is what the csv module expects for correct handling of
    # newlines embedded in quoted fields.
    with open(filename,encoding='utf-8',newline='') as f:
        rows = [
            [row[0], ''.join(' '+cell for cell in row[1:])]
            for row in csv.reader(f,delimiter=',')
            if row
        ]
    with open(file_to_write,'w',encoding='utf-8',newline='') as f:
        csv.writer(f,delimiter=',').writerows(rows)


def save_topk(filename,topk=100):
    """Unfinished stub: derives the '<name>_topk<ext>' output name and nothing else.

    Nothing is written or returned, and *topk* is not used.
    """
    stem, ext = filename[:-4], filename[-4:]
    file_to_write = stem + '_topk' + ext


def devided_save(df,savename,devide_num=10):
    """Split *df* into *devide_num* consecutive chunks and save each as HDF5.

    Chunks are written to ``<savename>1.h5`` ... ``<savename><devide_num>.h5``
    under the key 'keys'; the last chunk also takes the remainder rows.

    :param df: DataFrame to split
    :param savename: file name prefix
    :param devide_num: number of chunks (must be non-zero)
    """
    data_per_chunk = len(df)//devide_num
    for i in range(devide_num):
        start = i*data_per_chunk
        # The final chunk runs to the end so no remainder rows are lost.
        stop = None if i == devide_num-1 else start+data_per_chunk
        save_name = savename+str(i+1)+'.h5'
        # ``key`` by keyword: recent pandas deprecates passing it positionally.
        df[start:stop].to_hdf(save_name,key='keys')


def save_log(filename,log):
    """Append the string *log* to the file *filename*."""
    with open(filename, mode='a') as sink:
        sink.write(log)


def df_to_count(df,dic):
    """Count how often every vocabulary index occurs in ``df['indexed']``.

    :param df: DataFrame whose 'indexed' column holds (nested) lists of indices
    :param dic: vocabulary; only its size is used
    :return: list of ``[index, count]`` pairs, one per vocabulary entry
    """
    vocab_size = len(dic)
    print(vocab_size)
    count = [[i, 0] for i in range(vocab_size)]
    # Iterate the column directly: ``DataFrame.ix`` was removed from pandas.
    for indexed in df['indexed']:
        for word in list_unroll(indexed):
            try:
                count[word][1]+=1
            except (IndexError, TypeError):
                # Index outside the vocabulary (or not an integer): report it.
                print(word)
    return count

def truncate_datasets(df,filename,max_len=5000):
    """Cut every document to at most *max_len* tokens and pickle the result.

    A document is cut before the first sentence whose cumulative length
    exceeds *max_len*; the 'indexed' and 'len' columns of *df* are updated
    in place before saving.

    :param df: DataFrame whose 'len' column holds per-sentence lengths and
               whose 'indexed' column holds the matching sentences
    :param filename: destination passed to ``save_file``
    :param max_len: maximum total number of tokens per document
    """
    new_indexed = []
    new_len = []
    for indexed, sent_lens in zip(df['indexed'], df['len']):
        cut = len(sent_lens)
        total_len = 0
        for pos, sent_len in enumerate(sent_lens):
            total_len += sent_len
            if total_len > max_len:
                print(total_len)
                cut = pos
                break
        new_indexed.append(indexed[:cut])
        new_len.append(sent_lens[:cut])
    # Write whole columns back: rows yielded by ``iterrows`` are copies, so
    # the original's per-row assignments never reached the DataFrame.
    df['indexed'] = pd.Series(new_indexed, index=df.index)
    df['len'] = pd.Series(new_len, index=df.index)
    save_file(filename,df)

def add_chars(filename):
    """Add a 'chars_indexed' column with per-token character indices.

    Loads the pickled DataFrame *filename*, reads its 'tokens' column
    (documents of sentences of tokens), maps '-LRB-'/'-RRB-' to '('/')' and
    converts every character with the table from ``__make_char_dic``. Unknown
    characters get the index ``len(char_dic)`` and their token is printed.
    The result is pickled to ``filename + 't'``.
    """
    char_dic = __make_char_dic()
    unknown_index = len(char_dic)
    df = load_file(filename)
    docs_indexed = []
    for _, row in df.iterrows():
        doc_indexed = []
        for sent in row['tokens']:
            sent_indexed = []
            for token in sent:
                token = token.replace('-LRB-','(').replace('-RRB-',')')
                codes = []
                for char in token:
                    if char not in char_dic:
                        print(token)
                        codes.append(unknown_index)
                    else:
                        codes.append(char_dic[char])
                sent_indexed.append(codes)
            doc_indexed.append(sent_indexed)
        docs_indexed.append(doc_indexed)
    df['chars_indexed'] = docs_indexed
    save_file(filename+str('t'),df)

def pad_sow(charindexes,sow,eow):
    """
    Wrap a list of character indices in start and end markers, in place.

    :param charindexes: list of character indices (modified and returned)
    :param sow: marker inserted at the front
    :param eow: marker appended at the end
    :return: the same list object
    """
    print(charindexes)
    charindexes.insert(0,sow)
    charindexes.append(eow)
    return charindexes


def __make_char_dic():
    """Return the character table: each supported character mapped to its position."""
    chars = """abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'"/|\\_@#$%^&*~`+=<>()[]{}"""
    char_dic = {}
    for char in chars:
        next_index = len(char_dic)
        char_dic[char] = next_index
    return char_dic

def inverse_dictionary(dictionary):
    """
    Invert a word dictionary into an index-keyed one.

    :param dictionary: {word: {'index': int, 'cnt': int}}
    :return: {index: [word, cnt]}
    """
    return {
        info['index']: [word, info['cnt']]
        for word, info in dictionary.items()
    }

# Script entry point: concatenates the text columns of train.csv / test.csv,
# then builds and pickles the indexed datasets and the dictionary.
if __name__ =='__main__':
    t = time.time()  # start time for the elapsed-time report at the end
    # --- disabled: truncate previously pickled datasets / build the IMDB set ---
    # test_filename = 'test.pkl'
    # train_filename = 'train.pkl'
    # te = load_file(test_filename)
    # tr = load_file(train_filename)
    # truncate_datasets(te,'test_truncated.pkl')
    # truncate_datasets(tr,'train_truncated.pkl')
    # folder = 'data/imdb'
    # tr_df, te_df, real_count, dictionary = build_imdb_df(folder)
    # save_file('imdb_tr.pkl',tr_df)
    # save_file('imdb_te.pkl',te_df)
    # save_file('imdb_cnt.pkl',real_count)
    # save_file('imdb_dic.pkl',dictionary)
    # print('time spent : ',time.time()-t)
    #texts, tokenized_texts, lengths, labels = read_from_csv(temp_filename,hierarchy=True)
    # t = time.time()
    # print(os.getcwd())

    # Input files are expected in the current working directory.
    train_filename = r'train.csv'
    test_filename = r'test.csv'
    # # train_pd = r'train.pkl'
    # # dic_filename = r'dictionary.pickle'
    # # te = build_datasets_csv(test_filename,load_file(dic_filename),True)
    # # save_file('test.pkl', te)
    # Step 1: write train_concated.csv / test_concated.csv.
    unroll_csv(train_filename)
    unroll_csv(test_filename)
    # Step 2: build the datasets from the concatenated files written above.
    train_filename = r'train_concated.csv'
    test_filename = r'test_concated.csv'
    build_test_train_csv(train_filename,test_filename,1,remove_article=False)
    print('time spent : ',time.time()-t)
    # --- disabled: re-save the pickles with only the indexed/len/label columns ---
    # tr = load_file('train.pkl')
    # te = load_file('test.pkl')
    # trdf = pd.DataFrame({'indexed': tr['indexed'], 'len': tr['len'], 'label': tr['label']})
    # tedf = pd.DataFrame({'indexed': te['indexed'], 'len': te['len'], 'label': te['label']})
    # save_file('train.pkl',trdf)
    # save_file('test.pkl',tedf)

    # --- disabled: add character indices to a saved dataset ---
    # df_filename = r'D:\workspace\data\text_classification\ag\new\train.pkl'
    #
    # dictionary = load_file(r'D:\workspace\data\text_classification\ag\new\dictionary.pickle')
    # inv_dic = dict(zip(dictionary.values(),dictionary.keys()))
    # add_chars(df_filename)
