# cross_validation.py
# This file handles splitting a given dataset into multiple folds for k-fold cross-validation


# Internal Imports

# External Imports
import os
import glob
import random

# Global Variables


'''
----------generate_folds----------
- This function handles creating multiple folds for k-fold cross-validation
- Valid lines are dealt at random into num_folds parts; fold i is then written as
  every part except part i
-----Inputs-----
- filename - The file to source analogies from (only lines made of exactly 4 words are used)
- num_folds - The number of folds to generate (defaults to 5)
-----Output-----
- N/A - The resultant folds are written to auto-generated files named '<stem>-fold<i>.txt',
        where <stem> is filename cut at the first '.' of its final path component
-----Raises-----
- ValueError - If num_folds is less than 1
'''
def generate_folds(filename, num_folds = 5):
    # Fail before touching the disk; random.randint would otherwise raise mid-way through
    if num_folds < 1:
        raise ValueError(f"num_folds must be at least 1, got {num_folds}")

    # Cut the extension off the final path component only, so a '.' inside a directory
    # name (e.g. './data/x.txt' or 'data/v1.0/x.txt') does not truncate the whole path
    base_name = os.path.basename(filename)
    output_file = filename[:len(filename) - len(base_name)] + base_name.split('.')[0]
    temp_files = [f'{output_file}-temp{i+1}.txt' for i in range(num_folds)]

    try:
        # Open the source first so an unreadable input raises before any old fold is deleted
        with open(filename, 'r') as source:
            # Start by removing any previously-generated folds
            _remove_previous_folds(output_file)

            # Start reading in analogy data, distributing it to the respective temp files
            print(f"PREPARATION: Splitting the data into {num_folds} parts")
            _distribute_lines(source, temp_files)

        # Start preparing the output files
        print("PREPARATION: Files split. Begin writing folds")
        for i in range(num_folds):
            fold_file = f'{output_file}-fold{i+1}.txt'
            _write_fold(fold_file, temp_files, i)
            print(f"WRITING: Fold {i+1} written to {fold_file}")
    finally:
        # Delete exactly the temporary files this call created (also on failure); a
        # '-temp*.txt' glob would additionally hit unrelated files such as '-template.txt'
        print("CLEANUP: Removing temporary files")
        for temp_file in temp_files:
            if os.path.exists(temp_file):
                os.remove(temp_file)


'''
----------_remove_previous_folds----------
- This helper deletes fold files left behind by an earlier run
-----Inputs-----
- output_file - The path stem that the fold file names are built from
-----Output-----
- N/A - Files named '<output_file>-fold<digits>.txt' are removed from disk
'''
def _remove_previous_folds(output_file):
    stem = os.path.basename(output_file)
    # Escape the stem so glob characters in the path ('[', '*', '?') are matched literally
    for stale in glob.glob(f'{glob.escape(output_file)}-fold*.txt'):
        # Only numbered folds are ours; leave look-alikes such as '<stem>-folder.txt' alone
        fold_number = os.path.basename(stale)[len(stem) + len('-fold'):-len('.txt')]
        if fold_number.isdigit():
            os.remove(stale)


'''
----------_distribute_lines----------
- This helper deals every valid line of the source to one randomly chosen part file
-----Inputs-----
- source - An open, readable text file (or any iterable of lines)
- part_files - The paths of the part files to create
-----Output-----
- N/A - Each part file is (re)written with the lines dealt to it
'''
def _distribute_lines(source, part_files):
    writers = []
    try:
        for part_file in part_files:
            writers.append(open(part_file, 'w'))

        for line in source:
            # If the line can be split into 4 distinct words, throw it to one of the output streams at random
            if len(line.split()) == 4:
                # A final line without a newline would otherwise fuse with the next
                # line once the parts are concatenated into a fold
                if not line.endswith('\n'):
                    line += '\n'
                # randint is inclusive at both ends, so this picks an index in [0, k-1]
                index = random.randint(0, len(writers) - 1)
                writers[index].write(line)
    finally:
        # Close all the file writers, even if reading or writing failed part-way
        for writer in writers:
            writer.close()


'''
----------_write_fold----------
- This helper writes one fold as the concatenation of every part except the held-out one
-----Inputs-----
- fold_file - The path of the fold file to write
- part_files - The paths of all the part files, in order
- held_out - The index (into part_files) of the part to leave out of this fold
-----Output-----
- N/A - The fold is written to fold_file
'''
def _write_fold(fold_file, part_files, held_out):
    with open(fold_file, 'w') as fold_writer:
        for j, part_file in enumerate(part_files):
            # If the index doesn't match, write the analogies to the target file
            if j != held_out:
                with open(part_file, 'r') as part_reader:
                    fold_writer.writelines(part_reader)




if __name__ == "__main__":
    # Script entry point: split the hard-coded analogy dataset using the default fold count
    generate_folds('data/analogies/analogy_task.txt')