from datasets import load_dataset
from PIL import Image
from io import BytesIO
import requests
import os
import json
import uuid
import shutil



# Load the raw preference data: one JSON object per line (JSONL).
# The encoding is pinned to UTF-8 so decoding does not depend on the machine's
# locale, and blank lines are skipped so they do not crash json.loads.
with open('/home/user/llavafinetune/data/llava/llava_preference_data.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

# Flatten the records: every key of a record other than 'image_id' is treated
# as a question type whose value holds 'Question', 'Rejected' and 'Preferred'.
# Each such entry becomes one flat example in new_dataset.
new_dataset = []
for item in data:
    for key, entry in item.items():
        if key == 'image_id':
            continue
        new_item = {
            'id': item['image_id'],
            'img_path': '/home/user/llavafinetune/images/' + item['image_id'] + '.jpg',
            'prompt': entry['Question'],
            'rejected': entry['Rejected'],
            'preferred': entry['Preferred'],
            'question_type': key,
        }
        new_dataset.append(new_item)
            

# Load the VLFeedback dataset ("MMInstruction/VLFeedback") with Hugging Face `datasets`.
VLFeedback = load_dataset("MMInstruction/VLFeedback")

# Debugging aid (disabled): print the completions of the first training example.
# print(VLFeedback['train'][0]['completions'])
def process_dataset(item):
    """Pick the best- and worst-rated responses of one example.

    Every annotation in ``item['completions']['annotations']`` is scored as
    the mean of its 'Helpfulness' and 'Visual Faithfulness' ratings (each
    rating is converted with ``int``). The response at the position of the
    highest score is stored under ``item['preferred']`` and the one at the
    position of the lowest score under ``item['rejected']``. On ties the
    earliest position wins, so when all scores are equal both keys hold the
    same response.

    The example is modified in place and also returned.
    """
    completions = item['completions']
    mean_ratings = [
        (int(annotation['Helpfulness']['Rating'])
         + int(annotation['Visual Faithfulness']['Rating'])) / 2
        for annotation in completions['annotations']
    ]

    best_position = mean_ratings.index(max(mean_ratings))
    worst_position = mean_ratings.index(min(mean_ratings))

    item['preferred'] = completions['response'][best_position]
    item['rejected'] = completions['response'][worst_position]
    return item

# Apply process_dataset across the dataset so each example gains 'preferred'
# and 'rejected' fields.
# NOTE(review): the mapped VLFeedback result is not used again in the visible
# part of this script — confirm whether this step is still needed.
VLFeedback = VLFeedback.map(process_dataset)
   
# Follow-up hints keyed by question type. process_and_save looks up the hint
# for an example's 'question_type' and inserts it as the second human turn,
# between the rejected answer and the preferred answer.
feedbacks = {
    "Subject Ambiguity": "The question is ambiguous on which subject it is referring to. You may need to ask for clarification about it.",
    "Unclear User Background": "The user's background is unknown. You may need to ask for more information about it so that you can answer the question.",
    "Subjective Interpretations": "The question is asking you to make some subjective judgement. You may need to ask for some criteria or standards to make the judgement.",
    "Unanswerable Questions": "You may need to think about whether this question can be answered based on the image itself.",
    "False Premise": "There may be some false premise in the question. You may need to point it out if you find some.",
    "Latent Human Preferences": "You may need to interact with the user to elicit their preferences so that you can give a more user-targeted and customized answer."
}

def process_and_save(dataset, output_folder, output_file_name, feedback_by_type=None):
    """Convert preference examples to multi-turn conversations and save as JSON.

    Each example becomes a record with its id, its image path and a four-turn
    conversation: the human prompt, the rejected answer (from "gpt"), a human
    follow-up hint selected by the example's question type, and finally the
    preferred answer (from "gpt"). All records are written as a single JSON
    list with 4-space indentation. No image files are read or written; only
    the image path is copied into the record.

    Args:
        dataset: Iterable of mappings providing the keys 'id', 'img_path',
            'prompt', 'rejected', 'preferred' and 'question_type'.
        output_folder: Directory to write into. It is created (including
            parents) when it does not exist yet.
        output_file_name: Name of the JSON file inside ``output_folder``.
        feedback_by_type: Optional mapping from question type to the follow-up
            hint text. When omitted, the module-level ``feedbacks`` table is
            used.

    Raises:
        KeyError: If an example lacks one of the required keys, or its
            question type has no entry in the feedback mapping.
    """
    if feedback_by_type is None:
        feedback_by_type = feedbacks

    # Build every record before touching the output file, so a malformed
    # example fails before an existing file would be truncated.
    json_data_list = [
        {
            "id": item['id'],
            "image": item['img_path'],
            "conversations": [
                {"from": "human", "value": item['prompt']},
                {"from": "gpt", "value": item['rejected']},
                {"from": "human", "value": feedback_by_type[item['question_type']]},
                {"from": "gpt", "value": item['preferred']},
            ],
        }
        for item in dataset
    ]

    # An empty folder means "current directory"; os.makedirs('') would raise.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    json_output_path = os.path.join(output_folder, output_file_name)
    with open(json_output_path, 'w', encoding='utf-8') as json_file:
        json.dump(json_data_list, json_file, indent=4)




# Write the flattened preference examples (new_dataset) as multi-turn
# conversation records to preference_dataset_multiturn.json.
process_and_save(new_dataset, '/home/user/llavafinetune/data', 'preference_dataset_multiturn.json')