from argparse import ArgumentParser
from icecream import ic
import os
import json

from qa.global_preprocess import load_tables, load_anno_and_struc
from qa.data2text.read_annotations import read_annotation
from qa.data2text.align import align_table_annotation


# Base directory prepended to every path that main() builds.
# NOTE(review): this is a hard-coded absolute path — presumably specific to
# one machine; confirm or adjust before running elsewhere.
ROOT_PATH = '/data/home/hdd3000/USER/HMT/'
# Data directory, joined onto ROOT_PATH; the path-valued command-line options
# (html/annotation/valid-table directories and the output file) are joined
# onto ROOT_PATH + DATA_PATH in main().
DATA_PATH = 'qa/data/'


def main():
    """Align each valid table's annotations and dump the result as JSON.

    Reads the module-level ``args`` namespace (bound in the ``__main__``
    guard), so the command line must have been parsed before this is called.

    Steps:
      1. List ``args.annotation_dir`` and keep only the files whose name up
         to the first ``'.'``, plus ``'.json'``, is present in
         ``args.valid_table_dir``.
      2. Load all tables from ``args.html_dir`` via ``load_tables``.
      3. For every kept annotation file, load its structure and sentences,
         run ``align_table_annotation`` and collect the yielded
         ``(subsent_id, subsent_dict)`` pairs, dropping each dict's
         ``'structure'`` entry before storing it.
      4. Write the collected mapping to ``args.output_file`` as JSON.

    All directories and the output file are resolved relative to
    ``ROOT_PATH``/``DATA_PATH``.

    Raises:
        ValueError: if a kept annotation file's stem is not an integer.
        KeyError: if a table id is missing from the loaded tables, or an
            aligned sub-sentence dict has no ``'structure'`` key.
    """
    data_root = os.path.join(ROOT_PATH, DATA_PATH)
    annotation_dir = os.path.join(data_root, args.annotation_dir)

    annotation_files = os.listdir(annotation_dir)
    # Build the set once so each membership test below is O(1) rather than a
    # linear scan over the directory listing.
    valid_table_files = set(os.listdir(os.path.join(data_root, args.valid_table_dir)))
    annotation_files = [
        name for name in annotation_files
        if name.split('.')[0] + '.json' in valid_table_files
    ]

    table_dict = load_tables(os.path.join(data_root, args.html_dir))
    operator = None  # forwarded unchanged to load_anno_and_struc
    aligned_dict = {}
    for file in annotation_files:
        print(f"Processing {file}")
        anno_file_path = os.path.join(annotation_dir, file)
        # The part of the file name before the first '.' is the table id.
        table_id = int(file.split('.')[0])
        structure, sentences = load_anno_and_struc(
            anno_file_path, table_id, table_dict[table_id], operator
        )
        for subsent_id, subsent_dict in align_table_annotation(table_id, structure, sentences):
            # 'structure' is removed before the dict is stored and dumped.
            subsent_dict.pop('structure')
            aligned_dict[subsent_id] = subsent_dict
            ic(subsent_id)
            ic(subsent_dict)

    with open(os.path.join(data_root, args.output_file), 'w') as f:
        json.dump(aligned_dict, f)
    print("Done.")


if __name__ == "__main__":
    # Command-line options and their default sub-paths. Every option is a
    # string; main() reads the parsed values from the module-level ``args``.
    option_defaults = (
        ('--html_dir', 'html/'),
        ('--annotation_dir', 'annotations/'),
        ('--valid_table_dir', 'raw_input/table_filtered/'),
        ('--output_file', 'raw_input/parsed_info.json'),
    )

    parser = ArgumentParser()
    for option, fallback in option_defaults:
        parser.add_argument(option, type=str, default=fallback)

    # Must be bound at module level before main() runs: main() uses it as a global.
    args = parser.parse_args()

    main()
