import glob
import json, os, sys
from tqdm import tqdm
from utils.html_utils import *
from module.reflextion_crawler import AutoCrawler
from module.prompt import *
from run_swde.task_prompt import swde_prompt


from utils.custom_api import mixtral_87_api as chatgpt

# Field names listed per SWDE vertical. Only the keys (the vertical names)
# are read by the scan loop in this script.
SCHEMA = dict(
    auto=['model', 'price', 'engine', 'fuel_economy'],
    book=['title', 'author', 'isbn_13', 'publisher', 'publication_date'],
    camera=['model', 'price', 'manufacturer'],
    job=['title', 'company', 'location', 'date_posted'],
    movie=['title', 'director', 'genre', 'mpaa_rating'],
    nbaplayer=['name', 'team', 'height', 'weight'],
    restaurant=['name', 'address', 'phone', 'cuisine'],
    university=['name', 'phone', 'website', 'type'],
)

# Root directory that is globbed as DATA_HOME/<vertical>/<website>/<page>.
DATA_HOME = '/mnt/data122/harryhuang/swde/sourceCode'

# Output buckets '0'..'3'; each starts as its own empty list.
web_len_split = {str(bucket): [] for bucket in range(4)}
# Collects one (website_name, length) tuple per scanned website.
web_len = []

# For every website directory of every vertical in SCHEMA, read up to 100
# pages, run each through simplify_html, and append
# (website_name, average simplified length per page) to web_len.
for field in SCHEMA:
    for website_path in glob.glob(os.path.join(DATA_HOME, field, '*')):
        # Keep the directory name up to the first '(' (presumably a
        # page-count suffix follows it -- confirm against the dataset layout).
        website_name = os.path.basename(website_path).split('(')[0]
        print(website_name)

        # glob returns paths in arbitrary order; sort them so the sampled
        # pages are the same on every run. (The original called sorted()
        # and discarded the result, so the list was never actually sorted.)
        sampled_pages = sorted(glob.glob(os.path.join(website_path, '*')))[:100]

        web_length = 0
        for webpage in tqdm(sampled_pages):
            with open(webpage, 'r') as f:
                web_length += len(simplify_html(f.read()))

        # Average over the pages actually read rather than a fixed 100, so
        # sites with fewer pages are not under-reported. An empty directory
        # still records 0, as before.
        page_count = len(sampled_pages)
        average_length = web_length // page_count if page_count else 0
        web_len.append((website_name, average_length))

# Rank websites by their recorded length (ascending), then hand them out to
# buckets '0'..'3' in consecutive groups of twenty. Entries past the first
# eighty are not placed in any bucket.
web_len = sorted(web_len, key=lambda entry: entry[1])
for bucket_idx in range(4):
    start = bucket_idx * 20
    web_len_split[str(bucket_idx)].extend(web_len[start:start + 20])



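# Earlier bucketing scheme, kept for reference. It appears to have binned each
# site by its summed page length in units of 1,000,000 characters rather than
# by rank. NOTE: it refers to buckets '4' and '5+', which web_len_split does
# not currently define.
# if web_length // 1000000 == 0:
#     web_len_split['0'].append(website_name)
# elif web_length // 1000000 == 1:
#     web_len_split['1'].append(website_name)
# elif web_length // 1000000 == 2:
#     web_len_split['2'].append(website_name)
# elif web_length // 1000000 == 3:
#     web_len_split['3'].append(website_name)
# elif web_length // 1000000 == 4:
#     web_len_split['4'].append(website_name)
# else:
#     web_len_split['5+'].append(website_name)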
# Write the bucket assignment to run_swde.json in the working directory,
# indented by four spaces and with non-ASCII characters left unescaped.
with open('run_swde.json', 'w') as out_file:
    out_file.write(json.dumps(web_len_split, ensure_ascii=False, indent=4))
    