"""Utility functions for pre-processing hierarchical matrix tables."""

import os
import json
from bs4 import BeautifulSoup

from preprocess.structure import Table, table_to_struc


def load_tables(html_path):
    """Load the basic information of all tables found in a directory.

    Every regular file in `html_path` whose name starts with an integer
    followed by a dot (e.g. ``12.html``) is parsed with BeautifulSoup and
    wrapped in a `Table` whose id is that integer. Entries that are not
    regular files, or whose leading name component is not an integer
    (e.g. ``.DS_Store``, ``README.txt``), are skipped instead of aborting
    the whole load.

    Args:
        html_path: Directory containing one HTML file per table.

    Returns:
        A dict mapping the integer table id to its `Table` object, filled
        in ascending table-id order.
    """
    # Collect candidates first so stray entries are filtered out before any
    # file is opened.
    table_files = []
    for file_name in os.listdir(html_path):
        file_path = os.path.join(html_path, file_name)
        if not os.path.isfile(file_path):
            continue
        try:
            table_id = int(file_name.split('.')[0])
        except ValueError:
            # Not a "<table_id>.<ext>" file; ignore it.
            continue
        table_files.append((table_id, file_path))

    table_dict = {}
    # os.listdir() order is arbitrary; sorting by id keeps the result
    # deterministic across platforms and runs.
    for table_id, file_path in sorted(table_files):
        with open(file_path, 'r', encoding='utf-8') as f:
            soup = BeautifulSoup(f.read(), 'html.parser')
        # NOTE(review): the two None arguments are placeholders whose meaning
        # is defined by Table's constructor -- confirm against
        # preprocess.structure.
        table = Table(table_id, None, None)
        table.set_html(soup)
        table_dict[table_id] = table

    return table_dict



def _dump_jsonl(samples, path):
    """Write `samples` to `path` as JSON Lines, one `json.dumps` result per line."""
    with open(path, 'w', encoding='utf-8') as fw:
        fw.writelines(json.dumps(sample) + '\n' for sample in samples)


def split_dump_dataset(dataset, paths, portion=(0.8, 0.1, 0.1)):
    """Split the dataset into train/valid/test, and dump into the designated paths.

    The split is positional (no shuffling): the first ``int(n * portion[0])``
    samples form the train set, the last ``int(n * portion[2])`` samples form
    the test set, and everything in between is the validation set (so the
    validation set absorbs any rounding remainder).

    TODO: put sub-sentences of the same table in the same dataset.

    Args:
        dataset: Sliceable sequence of JSON-serializable samples.
        paths: Three output file paths, in (train, valid, test) order.
        portion: Three fractions, in (train, valid, test) order. Only the
            train and test fractions are used to size the splits; validation
            receives the rest.
    """
    n = len(dataset)
    print(f"Created {n} Samples in Total.")

    n_train = int(n * portion[0])
    # The test split is sized from the *test* fraction. Sizing it from
    # portion[1] swaps the validation/test sizes whenever the two fractions
    # differ.
    n_test = int(n * portion[2])

    trainset = dataset[:n_train]
    validset = dataset[n_train: n - n_test]
    testset = dataset[n - n_test:]

    train_path, valid_path, test_path = paths
    _dump_jsonl(trainset, train_path)
    _dump_jsonl(validset, valid_path)
    _dump_jsonl(testset, test_path)

    print(f"Dump {len(trainset)} train, {len(validset)} validation, {len(testset)} test samples.")

