# data_ops.py
# This file handles some general-purpose data loading/cleaning operations


# Internal Imports

# External Imports
import json
import os
import re
import string


'''
----------load_data----------
- This function handles loading in the raw text data from a properly-formatted file
-----Inputs-----
- file_path - The path to the file that needs to be read
-----Output-----
- text - The complete string of text data from the file
'''
def load_data(file_path):
    """Read a text corpus from ``file_path`` and return it as one string.

    Files whose name ends in ``.json`` are parsed as a JSON list of objects;
    each object's ``"text"`` value is lower-cased and the values are joined
    with single spaces. Any other file is read verbatim (case is preserved).

    Args:
        file_path: Path to the ``.json`` or plain-text file to read.

    Returns:
        The loaded text with leading and trailing whitespace stripped.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, encoding="utf-8") as input_file:
        if file_path.endswith(".json"):
            records = json.load(input_file)
            # join() builds the string in one pass instead of repeated "+=".
            text = " ".join(record["text"].lower() for record in records)
        else:  # Treat it as a raw text file
            text = input_file.read()

    return text.strip()



'''
----------clean_data----------
- This function handles cleaning the input data, to make it more usable
-----Inputs-----
- text - The raw text string to clean
- drop_stop_words - Whether to drop any stop words from the text (defaults to False; not yet implemented, so it currently has no effect)
-----Output-----
- cleaned_text - The text string, cleaned to remove anything extraneous
'''
def clean_data(text, drop_stop_words = False):
    """Strip non-word noise from ``text`` and lower-case it.

    Args:
        text: The raw text string to clean.
        drop_stop_words: Reserved for stop-word removal. Not implemented
            yet, so passing True currently has no effect.

    Returns:
        The cleaned, lower-cased text.
    """
    # Patterns are raw strings so the backslashes reach the regex engine
    # unchanged and Python does not warn about invalid escape sequences.

    # "@" plus the run of non-space characters after it (mentions, and the
    # domain part of e-mail addresses)
    cleaned_text = re.sub(r"@\S+", "", text)
    # Dollar signs in prices; the amount itself is kept
    cleaned_text = cleaned_text.replace("$", "")
    # Links: removes everything from the scheme to the end of the line,
    # together with the line break(s) that follow
    cleaned_text = re.sub(r"https?://.*[\r\n]*", "", cleaned_text)
    # Hash marks in hashtags; the tag word itself is kept
    cleaned_text = cleaned_text.replace("#", "")

    # Lowercase the whole text
    cleaned_text = cleaned_text.lower()

    # TODO: remove reserved words (mostly HTML/markdown reserved words)

    # TODO: stop-word removal is not implemented; the flag is a no-op
    if drop_stop_words:
        pass

    # NOTE: stripping punctuation / non-alphanumeric characters is currently
    # disabled. Candidate implementations, kept for reference:
    #   cleaned_text = re.sub(r"[^\w\d\s]", " ", cleaned_text)
    # or
    #   punct = set(string.punctuation)
    #   cleaned_text = "".join(ch for ch in cleaned_text if ch not in punct)

    return cleaned_text


'''
----------load_vocab----------
- This function handles converting the raw text string to an occurrence-counted vocabulary
-----Inputs-----
- file_path - The file path containing the raw text data to load
- text - The text to use (Optional)
- remove_numeric - Whether to drop fully-numeric words from the vocab (defaults to False)
-----Output-----
- vocab - The occurrence-counted vocab
'''
def load_vocab(file_path, text = "", remove_numeric = False, max_vocab_size = 0):
    """Load, or build and cache, an occurrence-counted vocabulary.

    The vocabulary is cached as ``vocab.json`` in the same directory as
    ``file_path``. If that file already exists it is loaded and returned
    unchanged; ``text``, ``remove_numeric`` and ``max_vocab_size`` are
    ignored in that case.

    Args:
        file_path: Path to the raw text data. Its directory is where
            ``vocab.json`` is read from / written to.
        text: Already-loaded text to count. When empty, the data is loaded
            from ``file_path`` and cleaned first.
        remove_numeric: When True, drop words for which ``str.isnumeric()``
            is true.
        max_vocab_size: Keep only this many most frequent words. ``0`` (the
            default) keeps every word.

    Returns:
        A dict mapping each whitespace-separated word to its count.
    """
    # os.path handles whichever separator the platform uses, unlike slicing
    # the path on "/".
    vocab_path = os.path.join(os.path.dirname(file_path), "vocab.json")

    # Reuse the cached vocab when there is one
    if os.path.exists(vocab_path):
        with open(vocab_path, "r", encoding="utf-8") as vocab_file:
            return json.load(vocab_file)

    if not text:
        text = clean_data(load_data(file_path))

    # Count every whitespace-separated token
    vocab = {}
    for word in text.split():
        vocab[word] = vocab.get(word, 0) + 1

    if remove_numeric:
        vocab = {
            word: count for word, count in vocab.items()
            if not word.isnumeric()
        }

    # Cut the vocab down to size: most frequent first. sorted() is stable,
    # so tied words keep their first-seen order.
    if 0 < max_vocab_size < len(vocab):
        ranked = sorted(vocab.items(), key=lambda item: item[1], reverse=True)
        vocab = dict(ranked[:max_vocab_size])

    # Save the new vocab for later calls
    with open(vocab_path, "w", encoding="utf-8") as vocab_file:
        json.dump(vocab, vocab_file, indent=4)

    return vocab


'''
----------load_cooccurrence----------
- This function handles building a co-occurrence matrix using the vocab and text
-----Inputs-----
- file_path - The file path containing the raw text data
- text - The text to use
- vocab - The associated vocab
- window_size - The size of the context window around each word (defaults to 3)
-----Output-----
- cooccurrence - The co-occurrence matrix
'''
def load_cooccurrence(file_path, text = "", vocab = "", window_size = 3):
    """Load, or build and cache, a word co-occurrence matrix.

    The matrix is cached as ``co_occurrence.json`` in the same directory as
    ``file_path``. If that file already exists it is loaded and returned
    unchanged; ``text``, ``vocab`` and ``window_size`` are ignored in that
    case.

    When built, every in-vocab token is treated as a centre word and each
    in-vocab token at most ``window_size`` positions to its left or right is
    counted once as a neighbour. The centre position itself is not counted,
    and the window is clamped at the start and end of the text.

    Args:
        file_path: Path to the raw text data. Its directory is where
            ``co_occurrence.json`` is read from / written to.
        text: Already-loaded text. When empty, the data is loaded from
            ``file_path`` and cleaned first.
        vocab: Container of allowed words (only membership is tested). When
            empty, it is obtained from ``load_vocab``.
        window_size: Number of neighbouring tokens considered on each side
            of the centre word.

    Returns:
        A dict of dicts: ``matrix[centre][neighbour]`` is the count.
    """
    # os.path handles whichever separator the platform uses, unlike slicing
    # the path on "/".
    cache_path = os.path.join(os.path.dirname(file_path), "co_occurrence.json")

    # Reuse the cached matrix when there is one
    if os.path.exists(cache_path):
        with open(cache_path, "r", encoding="utf-8") as cooccurrence_file:
            return json.load(cooccurrence_file)

    if not text:
        text = clean_data(load_data(file_path))
    if not vocab:
        vocab = load_vocab(file_path, text)

    tokens = text.split()
    token_count = len(tokens)
    cooccurrence = {}
    for i, center in enumerate(tokens):
        if center not in vocab:
            continue
        row = cooccurrence.setdefault(center, {})
        # Symmetric window [i - window_size, i + window_size], clamped so
        # that tokens near either end of the text are still processed.
        start = max(0, i - window_size)
        stop = min(token_count, i + window_size + 1)
        for j in range(start, stop):
            if j == i:
                # A word is not its own neighbour
                continue
            neighbor = tokens[j]
            if neighbor in vocab:
                row[neighbor] = row.get(neighbor, 0) + 1

    # Save the new matrix for later calls
    with open(cache_path, "w", encoding="utf-8") as cooccurrence_file:
        json.dump(cooccurrence, cooccurrence_file)

    return cooccurrence


'''
*** Integrate this function into the clean_data function that ALREADY EXISTS
'''
def clean_and_save_dataset(path, output_path = 'data/wiki_clean.json'):
    """Clean the ``"text"`` field of every record in a JSON dataset and save it.

    The input is parsed as a JSON list of objects. Each object's ``"text"``
    value is lower-cased, passed through ``clean_data`` and written back onto
    that same object; all other fields are left untouched. The result is
    written as JSON to ``output_path``.

    Each record is updated directly rather than being looked up again by its
    ``"id"``, so records that share an id each keep their own cleaned text.

    Args:
        path: Path to the input JSON file.
        output_path: Where to write the cleaned dataset. Defaults to
            ``'data/wiki_clean.json'``.
    """
    with open(path, encoding="utf-8") as input_file:
        records = json.load(input_file)

    for record in records:
        # The leading space is kept so the output matches files produced by
        # earlier versions of this function.
        record["text"] = clean_data(" " + record["text"].lower())

    with open(output_path, 'w') as output_file:
        json.dump(records, output_file)