from datasets import load_dataset
from PIL import Image
from io import BytesIO
import requests
import os
import json
import uuid
import shutil



# Read the local preference annotations: the file holds one JSON object per line.
with open('/home/user/llavafinetune/llava_preference_data.jsonl', 'r') as f:
    data = []
    for line in f:
        data.append(json.loads(line))

# Flatten every record into one entry per annotated question. Each key other
# than 'image_id' maps to a dict carrying 'Question', 'Rejected' and 'Preferred'.
new_dataset = [
    {
        'id': record['image_id'],
        'img_path': '/home/user/llavafinetune/images/' + record['image_id'] + '.jpg',
        'prompt': entry['Question'],
        'rejected': entry['Rejected'],
        'preferred': entry['Preferred'],
    }
    for record in data
    for field, entry in record.items()
    if field != 'image_id'
]
            

# Load the "MMInstruction/VLFeedback" dataset through the `datasets` library.
VLFeedback = load_dataset("MMInstruction/VLFeedback")

# Debugging aid: inspect the raw completions of the first training example.
# print(VLFeedback['train'][0]['completions'])
def process_dataset(item):
    """Attach the best- and worst-rated responses to one example.

    Each entry of ``item['completions']['annotations']`` is scored as the
    mean of its 'Helpfulness' and 'Visual Faithfulness' ratings (both
    converted with ``int``). The response at the position of the highest
    score is stored under ``item['preferred']`` and the one at the position
    of the lowest score under ``item['rejected']``. On ties the earliest
    position wins, so when every score is equal both fields hold the same
    response.

    The example is modified in place and also returned.

    Raises:
        ValueError: if there are no annotations to score.
    """
    completions = item['completions']
    scores = [
        (int(annotation['Helpfulness']['Rating'])
         + int(annotation['Visual Faithfulness']['Rating'])) / 2
        for annotation in completions['annotations']
    ]

    # max()/min() return the first extreme element they encounter, which
    # gives the same earliest-index tie-breaking as list.index().
    positions = range(len(scores))
    best = max(positions, key=scores.__getitem__)
    worst = min(positions, key=scores.__getitem__)

    item['preferred'] = completions['response'][best]
    item['rejected'] = completions['response'][worst]
    return item

# Run process_dataset over the loaded dataset via `.map`, so that the examples
# carry the 'preferred' and 'rejected' fields that process_and_save reads.
VLFeedback = VLFeedback.map(process_dataset)
   

def process_and_save(dataset, output_folder, output_file_name):
    """Write a preference dataset as separate rejected/preferred records.

    For every item two conversation records are emitted, in this order:
    one whose "gpt" turn carries ``{'rejected': ...}`` and one whose "gpt"
    turn carries ``{'preferred': ...}``. Both share the item's id, image
    path and prompt. All records are collected into a single list and
    dumped as indented JSON.

    Args:
        dataset: Iterable of mappings providing the keys 'id', 'img_path',
            'prompt', 'rejected' and 'preferred'.
        output_folder: Directory that receives the JSON file. It is created
            (including parents) when it does not exist yet. An empty string
            writes relative to the current working directory.
        output_file_name: Name of the JSON file inside ``output_folder``.

    Raises:
        KeyError: if an item lacks one of the required keys.
    """
    json_data_list = []
    for item in dataset:
        # Rejected first, then preferred: downstream consumers see the two
        # records of one item as consecutive entries in this order.
        for label in ('rejected', 'preferred'):
            json_data_list.append({
                "id": item['id'],
                "image": item['img_path'],
                "conversations": [
                    {
                        "from": "human",
                        "value": item['prompt']
                    },
                    {
                        "from": "gpt",
                        "value": {label: item[label]}
                    }
                ]
            })

    # Make sure the target directory exists; os.makedirs('') would raise,
    # so an empty folder (meaning "current directory") is left alone.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    json_output_path = os.path.join(output_folder, output_file_name)
    with open(json_output_path, 'w', encoding='utf-8') as json_file:
        json.dump(json_data_list, json_file, indent=4)



# Export both preference sources into the same data directory: the processed
# VLFeedback training split and the locally collected preference pairs.
output_dir = '/home/user/llavafinetune/data'
process_and_save(VLFeedback['train'], output_dir, 'VLfeedback_dataset_seperate.json')
process_and_save(new_dataset, output_dir, 'preference_dataset_seperate.json')