from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch

# --- Configuration -----------------------------------------------------------
# Hugging Face model identifier passed to both `from_pretrained` calls below.
PRETRAINED_PATH = "Salesforce/blip2-opt-2.7b"
# Default prompt for `blip2()`. The trailing space is part of the string.
PROMPT = "What is author's intention of this image? "
# Default value of `blip2()`'s `max_answer_tokens` argument.
MAX_ANSWER_TOKENS = 256
# Device the processed inputs are moved to inside `blip2()`.
DEVICE = 'cuda'
# dtype used when loading the model weights and when casting the inputs.
DTYPE = torch.float16
# NOTE(review): despite its name, this flag is forwarded as `load_in_8bit`
# below, i.e. it toggles 8-bit loading rather than fp16 -- confirm the intent.
FP16 = True

# Both objects are created at import time, so importing this module triggers
# loading (and presumably downloading, if not cached) the pretrained weights.
processor = Blip2Processor.from_pretrained(PRETRAINED_PATH)
# NOTE(review): the model is never explicitly moved to DEVICE here; placement
# is left to `from_pretrained` -- verify it ends up on the same device as the
# inputs, especially if FP16 is set to False.
model = Blip2ForConditionalGeneration.from_pretrained(PRETRAINED_PATH, load_in_8bit=FP16, torch_dtype=DTYPE)

def blip2(img_path, prompt=PROMPT, max_answer_tokens=MAX_ANSWER_TOKENS):
    """Run BLIP-2 on one image with a text prompt and return the decoded text.

    Args:
        img_path: Path (or file object) accepted by ``PIL.Image.open``.
        prompt: Text fed to the model together with the image.
        max_answer_tokens: Upper bound on the number of newly generated
            tokens (passed to ``generate`` as ``max_new_tokens``).

    Returns:
        The first decoded sequence with special tokens removed and
        surrounding whitespace stripped.

    Raises:
        FileNotFoundError / PIL.UnidentifiedImageError: propagated from
            ``Image.open`` when the path is missing or not an image.
    """
    # Open inside a context manager so the file handle is released promptly;
    # ``convert`` forces the pixel data to be read and returns an in-memory
    # copy that stays valid after the file is closed.
    with Image.open(img_path) as img:
        image = img.convert("RGB")

    # The processor's keyword for the prompt is ``text``; passing it under any
    # other name means the prompt never reaches the tokenizer.
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device=DEVICE, dtype=DTYPE)

    # Bound the answer in tokens. ``len(prompt)`` is a character count and must
    # not be mixed into a token budget, so use ``max_new_tokens`` directly.
    generated_ids = model.generate(**inputs, max_new_tokens=max_answer_tokens)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

    return generated_text
