import glob
import json, os, sys
from tqdm import tqdm
from utils.html_utils import *
from module.reflextion_crawler import AutoCrawler
from run_swde.task_prompt import swde_prompt

#from utils.gemini_api import genai_api as chatgpt
#from utils.custom_api import mixtral_87_api as chatgpt
#from utils.custom_api import mistral_7B_api as chatgpt
# from utils.custom_api import deepseek_33b_api as chatgpt
#from utils.custom_api import longllama_7b_api as chatgpt
from utils.custom_api import codellama_api as chatgpt
#from utils.custom_api import chatglm_6b_api as chatgpt
# from utils.custom_api import llama_13b_api as chatgpt
#from utils.ms_api_copy import ms_chatgpt as chatgpt
# from utils.ms_api_copy import ms_gpt4 as chatgpt
#from utils.custom_api import deepseek_chat as chatgpt

# Attributes to extract for each SWDE vertical. Keys are the vertical
# (sub-directory) names; values are the attribute names, in the order in
# which rules are synthesized for them.
SCHEMA = dict(
    auto=["model", "price", "engine", "fuel_economy"],
    book=["title", "author", "isbn_13", "publisher", "publication_date"],
    camera=["model", "price", "manufacturer"],
    job=["title", "company", "location", "date_posted"],
    movie=["title", "director", "genre", "mpaa_rating"],
    nbaplayer=["name", "team", "height", "weight"],
    restaurant=["name", "address", "phone", "cuisine"],
    university=["name", "phone", "website", "type"],
)

# --- Run configuration -------------------------------------------------------
# PATTERN is passed to the crawler and is also used as an output
# sub-directory name and as the suffix of every result file.
PATTERN = 'origin'
# Root directory that is globbed for <field>/<website> folders.
DATA_HOME = '/mnt/data122/harryhuang/swde/sourceCode'
# Root directory under which the synthesized rule files are written.
OUTPUT_HOME = 'dataset/swde/codellama_wr'

# Number of pages read per website and handed to rule synthesis (despite the
# name, this counts pages, not websites).
num_seed_website = 3
# Forwarded unchanged to AutoCrawler.rule_synthesis as `per_page_repeat_time`.
per_page_repeat_time = 1

# Crawler backed by whichever `chatgpt` API wrapper is imported above.
xe = AutoCrawler(pattern=PATTERN, api=chatgpt)

# Websites excluded from this run. Each entry is matched as a substring of the
# website directory name.
SKIPPED_WEBSITES = (
    'book-amazon',
    'camera-onsale',
    'camera-jr',
    'camera-compsource',
    'camera-buy',
    'movie-metacritic',
    'movie-rottentomatoes',
    'nbaplayer-wiki',
    'university-collegenavigator',
    'university-matchcollege',
    'movie-hollywood',
    'nbaplayer-foxsports',
    'restaurant-tripadvisor',
)

# For every vertical in SCHEMA and every website folder below it, synthesize
# one rule per attribute from a few seed pages and dump the rules to
# <OUTPUT_HOME>/<PATTERN>/<field>/<website>_<PATTERN>.json.
for field, items in SCHEMA.items():
    output_dir = os.path.join(OUTPUT_HOME, PATTERN, field)
    # exist_ok replaces the separate exists() check and avoids the
    # check-then-create race.
    os.makedirs(output_dir, exist_ok=True)

    for website_path in glob.glob(os.path.join(DATA_HOME, field, '*')):
        # Keep only the part of the directory name before the first '('.
        website_name = os.path.basename(website_path).split('(')[0]
        print('-'*200)
        print(website_name)

        output_path = os.path.join(output_dir, f'{website_name}_{PATTERN}.json')
        # An existing result file means this website was already processed,
        # so an interrupted run can be resumed.
        if os.path.exists(output_path):
            continue
        if any(skipped in website_name for skipped in SKIPPED_WEBSITES):
            continue

        # sorted() returns a new list; the result must be kept so that the
        # seed pages below are chosen deterministically rather than in glob's
        # arbitrary order.
        webpage_list = sorted(glob.glob(os.path.join(website_path, '*')))

        # Read the first `num_seed_website` pages as seeds for rule synthesis.
        html_list = []
        for html_page in webpage_list[:num_seed_website]:
            with open(html_page, 'r') as f:
                html_list.append(f.read())

        xpath_rule = {}
        for item in items:
            print('-'*150)
            instruction = f"{swde_prompt[field]['meta']} {swde_prompt[field][item]} {swde_prompt['meta']}"
            print(instruction)
            xpath_rule[item] = xe.rule_synthesis(website_name, html_list, instruction, per_page_repeat_time=per_page_repeat_time)

        with open(output_path, 'w') as f:
            json.dump(xpath_rule, f, indent=4)