from PIL import Image
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration
import torch

# Checkpoint identifier handed to ``from_pretrained`` for both the processor
# and the model in ``load_instructblip``.
PRETRAINED_PATH = "Salesforce/instructblip-vicuna-7b"
# Default instruction used by ``instructblip`` when the caller passes no prompt.
PROMPT = "What is author's intention of this image? "
# Torch device that ``instructblip`` moves the processor outputs onto.
DEVICE = 'cuda'
# Dtype passed as ``torch_dtype`` when loading the model and used to cast the
# processor outputs before generation.
DTYPE = torch.float16
# NOTE(review): despite its name, this flag is forwarded as ``load_in_8bit``
# in ``load_instructblip`` — i.e. it toggles 8-bit loading, not fp16.
# Confirm the intended meaning before renaming.
FP16 = True

def load_instructblip():
    """Load the InstructBLIP processor and model named by ``PRETRAINED_PATH``.

    The model is loaded with ``torch_dtype=DTYPE`` and ``load_in_8bit=FP16``.
    When 8-bit loading is disabled the model is explicitly moved to
    ``DEVICE`` so that it lives on the same device ``instructblip`` sends its
    inputs to.

    Returns:
        tuple: ``(processor, model)``, in the order expected by
        ``instructblip``.
    """
    processor = InstructBlipProcessor.from_pretrained(PRETRAINED_PATH)
    model = InstructBlipForConditionalGeneration.from_pretrained(
        PRETRAINED_PATH,
        load_in_8bit=FP16,
        torch_dtype=DTYPE,
    )
    if not FP16:
        # With 8-bit loading the loader dispatches the weights to the GPU
        # itself (and some transformers versions reject ``.to()`` on
        # quantized models), so only the non-quantized path needs an explicit
        # move. Without it the weights would stay on CPU while the inputs are
        # sent to DEVICE.
        model = model.to(DEVICE)
    return processor, model

def instructblip(processor, model, image_path, prompt=PROMPT):
    """Generate a text answer to ``prompt`` about the image at ``image_path``.

    Args:
        processor: Callable processor (as returned by ``load_instructblip``)
            that builds model inputs from text and image and provides
            ``batch_decode``.
        model: Model (as returned by ``load_instructblip``) whose
            ``generate`` method produces token ids.
        image_path: Anything accepted by ``PIL.Image.open`` (path or file
            object).
        prompt: Instruction text given to the model; defaults to ``PROMPT``.

    Returns:
        str: The first decoded sequence with special tokens skipped and
        surrounding whitespace stripped.
    """
    # ``Image.open`` is lazy and keeps the file open; converting inside the
    # ``with`` block forces the pixel data to load and lets the handle close.
    # The RGB conversion normalizes RGBA / palette / grayscale files to three
    # channels before they reach the processor.
    with Image.open(image_path) as raw_image:
        image = raw_image.convert("RGB")

    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device=DEVICE, dtype=DTYPE)

    # Beam search with a repetition penalty; sampling options are left at
    # their defaults.
    generated_ids = model.generate(
        **inputs,
        num_beams=5,
        max_length=1024,
        min_length=1,
        repetition_penalty=1.5,
        length_penalty=1.0,
        temperature=1,
    )
    decoded = processor.batch_decode(generated_ids, skip_special_tokens=True)
    return decoded[0].strip()

