import base64
import requests
import json
import re
import os
import random
import tqdm
import argparse

# OpenAI API key used for the Authorization header. Prefer the OPENAI_API_KEY
# environment variable so the real secret never has to be committed to the
# repository; fall back to the original placeholder when it is unset.
api_key = os.environ.get("OPENAI_API_KEY", "xxxxxxxxx")

def encode_image(image_path):
    """Return the contents of the file at ``image_path`` as a base64 string.

    The file is read in binary mode, base64-encoded, and the resulting bytes
    are decoded as UTF-8 so the value can be embedded in a text payload.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode('utf-8')



# Instruction text sent alongside every image. The backslash continuations keep
# it a single string literal; the leading spaces on each continued line are part
# of the prompt.
_QUESTION_PROMPT = "Generate five diverse ambiguous questions for a vision language model to answer when it is given an image. We expect the vision language model to ask further clarification questions when given ambiguous questions, or say 'I don't know' when given unanswerable questions, or challenge false assumptions when given false premises questions.\
               Each question should match one following category. Try to be creative and be diverse. Do not just follow the examples given but try to think about new types according to the category explanation.\
               1. Subject Ambiguity: When the image has multiple people or objects of the same type, generate an ambiguous question that does not clearly specify which individual person or object is being asked. When given this type of questions, further clarifying questions such as 'Which person you are asking' is expected. Don't generate questions that can be directly answered without further asking. \
               For example, if there are multiple men in the image, instead of asking 'Who is wearing sunglasses?' which can lead to a specific answer that 'A and B', ask 'Is the man wearing sunglasses?' by not indicating which specific man you are asking. \
               Similar questions can also be asked for objects. For example: 'What is the pattern of that cushion?' when given an image where there are multiple cushions. If there is only one person or one object of the same type in the image, then output 'N/A' for this category.\
               2. Unclear User Background: Questions that compare the scenes or persons with you when no information about you is provided: (e.g.: “Is the car the same color as mine?” without information about which color your car is.).\
               3. Subjective Interpretations: Questions that rely on subjective judgment without clear criteria or specific human preference when no preferences are given. (e.g., 'Which painting is the best?','Is this style modern?' where 'best' and 'modern' are subjective words. Or 'Which person you are more likely to befriend with?' when no human preferences are known). \
               4. Unanswerable Questions: Questions that completely cannot be answered or inferred based on the image alone, even with clarification questions (e.g., 'What is the name of the person in the image?' when the image does not contain any text or name tags). Avoid questions that have uncertainty or ambiguity on whether it can be answered. \
               5. False Premise: Tricky Questions that give false premises or incorrect assumptions.(e.g., 'Is the woman wearing a red shirt' for an image containing two men and one of them is wearing a red shirt is a tricky false premise question since there is no woman.\
               Or 'What type of plants are visible on the balcony outside the window?' when there is indeed a balcony outside the window but there are no plants.) The goal of this category is to fool the model so that it will fail to point out the false assumptions.\
               Remember to make each question distinct from the others, encompassing a wide range of ambiguity that would necessitate a follow-up question for clarity.\
               The output should strictly follow the example format below: \
               1. Subject Ambiguity: Is he looking at the sky?\
               2. Unclear User Background: Is the room layout similar to my study area? \
               3. Subjective Interpretations: Does this room reflect a vintage or modern aesthetic?\
               4. Unanswerable Questions: What is the name of the person who owns this computer?\
               5. False Premise: What type of plants are visible on the balcony outside the window?\
               Follow this format exactly and do not include additional explanations or text outside of the specified question format. But make sure to replace the example questions with your own unique questions."

# Matches one "<number>. <category>: <question>" line of the model's reply.
# Compiled once here instead of on every retry.
_QUESTION_LINE_PATTERN = re.compile(r'^(\d+)\.\s+(.+?):\s+(.+)$', re.MULTILINE)

_CHAT_COMPLETIONS_URL = "https://api.openai.com/v1/chat/completions"
_MAX_ATTEMPTS = 3
# Without a timeout a single stalled connection would block the batch forever.
_REQUEST_TIMEOUT_SECONDS = 120


def _build_payload(base64_image):
    """Build the chat-completions request body for one base64-encoded JPEG."""
    return {
        "model": "gpt-4-turbo",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": _QUESTION_PROMPT,
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/jpeg;base64,{base64_image}"
                        },
                    },
                ],
            }
        ],
        "max_tokens": 300,
    }


def _parse_questions(text):
    """Parse numbered "category: question" lines into a {category: question} dict."""
    if not isinstance(text, str):
        # The reply content can be missing/None; treat that like an empty reply.
        return {}
    return {
        category.strip(): question.strip()
        for _, category, question in _QUESTION_LINE_PATTERN.findall(text)
    }


def prompt_gpt4v(image_id):
    """Ask the chat-completions API for ambiguous questions about one image.

    The image ``/home/user/llavafinetune/images/{image_id}.jpg`` is read,
    base64-encoded and sent together with the question-generation prompt.
    The reply is parsed line by line into a dictionary.

    Args:
        image_id: File name of the image without its ``.jpg`` extension.

    Returns:
        A dict mapping each category name found in the reply to its question,
        plus an ``'image_id'`` entry, or ``None`` if no attempt produced a
        parseable reply.

    Raises:
        Whatever ``encode_image`` raises when the image cannot be read
        (e.g. ``OSError`` for a missing file). Request and response errors are
        not raised; they consume one of the attempts instead.
    """
    image_path = f"/home/user/llavafinetune/images/{image_id}.jpg"
    base64_image = encode_image(image_path)
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }
    payload = _build_payload(base64_image)

    for attempt in range(1, _MAX_ATTEMPTS + 1):
        try:
            response = requests.post(
                _CHAT_COMPLETIONS_URL,
                headers=headers,
                json=payload,
                timeout=_REQUEST_TIMEOUT_SECONDS,
            )
            output = response.json()
        except (requests.RequestException, ValueError) as err:
            # Connection problems, timeouts and non-JSON bodies are transient
            # often enough that they should use up an attempt, not abort.
            print(f"Attempt {attempt}: Request failed ({err!r}).")
            continue

        try:
            questions = output['choices'][0]['message']['content']
        except (KeyError, IndexError, TypeError):
            # Error payloads (rate limit, invalid request, ...) carry no
            # 'choices'; show what came back so the cause is visible.
            print(f"Attempt {attempt}: Unexpected API response: {output!r}")
            continue

        result = _parse_questions(questions)
        if result:
            result['image_id'] = image_id
            return result
        print(f"Attempt {attempt}: Received empty result.")

    print(f"Failed to generate questions for image {image_id}.")
    return None







# Directory that holds the candidate images (the same directory prompt_gpt4v
# reads from).
image_dir = '/home/user/llavafinetune/images'
# os.listdir returns entries in arbitrary order; sort them so that
# --start/--end select the same shard on every run.
files = sorted(os.listdir(image_dir))

parser = argparse.ArgumentParser()
parser.add_argument("--start", type=int)
parser.add_argument("--end", type=int)
# Required up front: without it the first write would only fail after many
# API calls had already been made.
parser.add_argument("--output", type=str, required=True)
args = parser.parse_args()
samples = files[args.start:args.end]

# Number of successful results buffered in memory before appending to disk.
_FLUSH_EVERY = 100


def _flush_results(path, items):
    """Append every item in ``items`` to ``path`` as one JSON object per line."""
    with open(path, "a") as f:
        for item in items:
            f.write(json.dumps(item) + "\n")


result = []

for file in tqdm.tqdm(samples):
    image_id = os.path.splitext(file)[0]
    try:
        a = prompt_gpt4v(image_id)
    except Exception as err:
        # Catch Exception rather than everything so Ctrl-C still stops the
        # run, and report the cause instead of hiding it.
        print(f"Failed to generate questions for image {image_id}: {err!r}")
        continue
    if a is None:
        continue
    result.append(a)
    if len(result) >= _FLUSH_EVERY:
        _flush_results(args.output, result)
        result = []

# Write the final partial batch; otherwise up to _FLUSH_EVERY - 1 results
# would be lost when the loop ends.
if result:
    _flush_results(args.output, result)
