from utils.ms_api_copy import ms_chatgpt as chatgpt
from utils.html_utils import simplify_html
import json
from lxml import etree, html

def extract_with_xpath(html_content: str, xpath: str) -> list[str]:
    """Evaluate an XPath expression against an HTML document.

    Args:
        html_content (str): text of HTML
        xpath (str): the XPath expression; a blank expression yields ``[]``

    Returns:
        list[str]: one string per match. String results (``text()``,
        ``@attr``) are returned as-is; element results contribute their
        ``.text`` (``""`` when the element has no leading text). A scalar
        XPath result (e.g. from ``string(...)`` or ``count(...)``) is
        returned as a single-item list.
    """
    if not xpath.strip():
        return []

    tree = etree.HTML(html_content)  # type: ignore
    if tree is None:
        # Nothing was parsed, so there is nothing to query.
        return []

    matches = tree.xpath(xpath)
    if not isinstance(matches, list):
        # XPath functions return a bare str/float/bool instead of a node list;
        # iterating a str here would wrongly split it into characters.
        return [str(matches)]

    extracted = []
    for match in matches:
        if isinstance(match, str):
            # text()/attribute results are str subclasses; normalise to str.
            extracted.append(str(match))
        else:
            extracted.append(match.text or "")
    return extracted
    
def _ancestors_root_first(node):
    """Return the ancestor elements of an XPath match, ordered root-first."""
    if isinstance(node, str):
        # text()/@attr matches are "smart strings": they are not elements, so
        # their enclosing element counts as the nearest ancestor.
        get_parent = getattr(node, 'getparent', None)
        parent = get_parent() if get_parent is not None else None
        if parent is None:
            return []
        chain = [parent, *parent.iterancestors()]
    else:
        chain = list(node.iterancestors())
    chain.reverse()
    return chain


def find_common_ancestor(html_content:str, xpath:str):
    """Serialize the nearest element that encloses every node matched by xpath.

    Args:
        html_content (str): text of HTML
        xpath (str): XPath selecting the nodes of interest

    Returns:
        str: pretty-printed markup of the deepest ancestor shared by all
        matches. When a match is the document root itself (it has no
        ancestors), the root element is returned.

    Raises:
        ValueError: if the document cannot be parsed or the XPath does not
            select any node.
    """
    tree = etree.HTML(html_content)
    if tree is None:
        raise ValueError("html_content could not be parsed into a document")

    nodes = tree.xpath(xpath)
    if not isinstance(nodes, list) or not nodes:
        raise ValueError(f"XPath matched no nodes: {xpath!r}")

    # Every ancestor chain starts at the root, so the chains share a common
    # prefix; the last element of that prefix is the nearest common ancestor.
    chains = [_ancestors_root_first(node) for node in nodes]
    nearest_common_ancestor = tree
    for level in zip(*chains):
        candidate = level[0]
        if any(other is not candidate for other in level[1:]):
            break
        nearest_common_ancestor = candidate

    return etree.tostring(nearest_common_ancestor, pretty_print=True, encoding='unicode')


# Prompt fragments for the LLM-based XPath extraction experiments.
# In this file they are spliced into the final query as f-string *values*,
# which are inserted verbatim. The JSON examples therefore use single braces:
# a doubled "{{" would reach the model unchanged and invite invalid JSON back.

# Role description placed at the start of every query.
meta_prompt = "Suppose you're a web parser that is good at reading and understanding the HTML code and can give clear executable code on the browser."

# Task description: which attribute to extract from the page.
Instruction = "Here's a webpage with detail information of a movie. Please extract the runtime of the movie. It's worth noticing that the candidate attribute values are the non-empty strings contained in text nodes in the corresponding DOM tree, and one page may contain multiple distinct values that correspond to an attribute."

# Answer schema when the model should return the extracted value itself.
Output_format = """Only output the result in the following Json format without other words:
{
    "result": "" # result of the instruction 
}"""

# Answer schema when the model should return an XPath to a known value.
Xpath_format = """Only output the xpath to the value '1 hrs. 54 min.' in the following Json format without other words:
{
    "xpath": "" # xpath to the element
}"""

# Free-form reasoning request (no JSON schema attached).
Thinking_format = '''
Output a brief thinking of how to locate the information, including with the help of other element, roughly determine the approximate position of elements in HTML. The output format are as follow:

'''

# Chain-of-thought request: ask for coarse-to-fine localisation steps as JSON.
COT_format = """Give some steps that can help finding the director with writing a stable and reliable Xpath from general part of the HTML to specific element finding in the HTML. 
For example, for the task that extract the similar works in the HTML, we can confirm the following steps.
{
    "step1": "Find the result table that contains the detail of the movie",
    "step2": "Find the blocks that indicate similar works in the HTML.",
    "step3": "Extract the similar works from the neighbor DOM elements"
}

Output the result in the following Json format without other words:
{
    "step1": "" #
    "step2": "" #
    ...
}"""

# Read one sample movie page from disk, then pass it through simplify_html
# with ['class', 'id'] (presumably the attributes to keep; confirm against
# utils.html_utils).
with open('/mnt/data122/harryhuang/swde/sourceCode/movie/movie-boxofficemojo(2000)/0435.htm') as page_file:
    raw_page = page_file.read()
html_content = simplify_html(raw_page, ['class', 'id'])

# COT prompts
# query = f'{meta_prompt}/n{Instruction}/n{COT_format}/nHere\'s the HTML code./n```{html}```'
# response = chatgpt(query)
# new_html = html
# res = json.loads(response)
# print(res)

# for item in res.items():
#     query = f'{meta_prompt}/n{item}/n{Xpath_format}/nHere\'s the HTML code./n```{new_html}```'
#     response = chatgpt(query)
#     res = json.loads(response)
#     print(res)
#     print(extract_with_xpath(new_html, res['xpath']))
# Single-shot XPath request: role, task, answer schema and the page markup,
# each on its own line ("\n" separators; the earlier "/n" was sent literally).
query = f'{meta_prompt}\n{Instruction}\n{Xpath_format}\nHere\'s the HTML code.\n```{html_content}```'
response = chatgpt(query)
print(response)
# The model is asked to reply with bare JSON; json.loads raises if it does not.
res = json.loads(response)
print(res)


# print(len(html_content))
# new_html_content = find_common_ancestor(html_content, res['xpath'])
# print(len(new_html_content))
# print()
# # XPath writer:
# query = f'{meta_prompt}/n{Instruction}/n{Xpath_format}/nHere\'s the HTML code./n```{new_html_content}```'
# response = chatgpt(query)
# res = json.loads(response)
# print(res)
# print(extract_with_xpath(new_html_content, res['xpath']))