import glob
import json, os, sys
from tqdm import tqdm
from utils.html_utils import *
from module import XpathExtractor
from prompt import *
from run_swde.task_prompt import swde_prompt

# Attributes to extract for each SWDE vertical, keyed by vertical name.
# The key order is the order in which verticals are processed, and each list
# gives the order in which that vertical's attributes are handled.
SCHEMA = {
    "auto": ["model", "price", "engine", "fuel_economy"],
    "book": ["title", "author", "isbn_13", "publisher", "publication_date"],
    "camera": ["model", "price", "manufacturer"],
    "job": ["title", "company", "location", "date_posted"],
    "movie": ["title", "director", "genre", "mpaa_rating"],
    "nbaplayer": ["name", "team", "height", "weight"],
    "restaurant": ["name", "address", "phone", "cuisine"],
    "university": ["name", "phone", "website", "type"],
}

# Root of the input tree; it is globbed as <DATA_HOME>/<vertical>/<website>/<page>.
DATA_HOME = "/mnt/data122/harryhuang/swde/sourceCode"
# Root of the output tree; rule files are read from and written to
# <OUTPUT_HOME>/<vertical>/.
OUTPUT_HOME = "dataset/swde/v1223"

# Single extractor instance, reused for every website processed below.
xe = XpathExtractor()

# Rule-selection pass.
#
# For every vertical in SCHEMA and every website directory found under
# DATA_HOME/<vertical>/, this loop:
#   1. loads the candidate XPath rules from <website>_rule.json,
#   2. reads the first page of the website (in sorted filename order),
#   3. asks the extractor to select a rule per attribute via select_xpath,
#   4. writes the selection to <website>_rule1.json.
#
# A missing <website>_rule.json is not handled here: open() raises
# FileNotFoundError and the run stops, exactly as before.
for field in SCHEMA:
    output_dir = os.path.join(OUTPUT_HOME, field)
    # exist_ok replaces the separate exists() check and is race-free.
    os.makedirs(output_dir, exist_ok=True)

    # glob yields entries in arbitrary order; sort so runs are reproducible.
    for website_path in sorted(glob.glob(os.path.join(DATA_HOME, field, '*'))):
        # Directory name up to (not including) the first '(' is the website name.
        website_name = os.path.basename(website_path).split('(')[0]
        print(website_name)
        # if os.path.exists(os.path.join(OUTPUT_HOME, field, website_name) + '.json'):
        #     continue

        # The original called sorted() and discarded the result, so the
        # "first" page depended on filesystem order. Keep the sorted list.
        webpage_list = sorted(glob.glob(os.path.join(website_path, '*')))
        if not webpage_list:
            # Nothing to select rules against; skip instead of raising
            # IndexError on webpage_list[0].
            print(f"no pages found under {website_path}, skipping", file=sys.stderr)
            continue

        rule_prefix = os.path.join(output_dir, website_name)

        # Candidate rules for this website, keyed by attribute name.
        with open(rule_prefix + '_rule.json', 'r') as f:
            xpath_rule = json.load(f)

        # Rule selection: only the first page (lexicographic order) is used.
        with open(webpage_list[0], 'r') as f:
            html = f.read()

        rule = {
            item: xe.select_xpath(
                html,
                f"{swde_prompt[field]['meta']} {swde_prompt[field][item]}",
                xpath_rule[item],
            )
            for item in SCHEMA[field]
        }

        #print(rule)
        with open(rule_prefix + '_rule1.json', 'w') as f:
            json.dump(rule, f, indent=4)

        # Rule execution stage (currently disabled).
        # # Rule execution
        # result_list = []
        # for webpage in tqdm(webpage_list[:100]):
        #     web_index = webpage.split('/')[-1].replace('.htm','')

        #     with open(webpage, 'r') as f:
        #         html = f.read()

        #     new_res = {'page': web_index}
        #     for item in SCHEMA[field]:
        #         #print(xpath_rule)
        #         item_value = xe.extract_with_xpath(html, xpath_rule[item][0])
        #         new_res[item] = item_value

        #         #print(item, item_value)
        #     result_list.append(new_res)

        # with open(os.path.join(OUTPUT_HOME, field, website_name) + '.json', 'w') as f:
        #     json.dump(result_list, f, indent=4)
    