from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
import torch
# Seed PyTorch's global RNG once at import time (presumably so that any
# sampling done during generation is repeatable between runs).
torch.manual_seed(1234)


def load_qwenVL(model_name="Qwen/Qwen-VL", device_map="cuda"):
  """Load a Qwen-VL tokenizer and model, with the model in eval mode.

  Both objects are loaded with ``trust_remote_code=True``, which means Python
  code shipped with the checkpoint is executed; only point ``model_name`` at a
  repository you trust.

  Args:
    model_name: Checkpoint identifier passed to both ``from_pretrained``
      calls. Defaults to ``"Qwen/Qwen-VL"``.
    device_map: Forwarded unchanged as the ``device_map`` argument when
      loading the model. Defaults to ``"cuda"``.

  Returns:
    A ``(tokenizer, model)`` tuple; ``.eval()`` has already been called on
    the model.
  """
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

  # No precision flag (bf16/fp16) is passed here, so the weights are loaded in
  # whatever precision ``from_pretrained`` selects by default.
  model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map=device_map,
    trust_remote_code=True,
  ).eval()
  return tokenizer, model

def qwenVL(tokenizer, model, img, prompt):
  """Ask the model one image + text question and return the answer text.

  The query is built from ``img`` and ``prompt``, run through
  ``model.generate`` and decoded with special tokens kept. The decoded text
  holds the prompt followed by the generated continuation, so the answer is
  extracted as the text that follows the first ``'ANSWER:'`` marker (and
  precedes the next marker, if there is one).

  Args:
    tokenizer: Tokenizer offering ``from_list_format``, ``__call__`` and
      ``decode`` (e.g. the one returned by ``load_qwenVL``).
    model: Model offering ``device`` and ``generate``.
    img: Value placed under the ``'image'`` key of the query (presumably an
      image path or URL -- confirm against the tokenizer in use).
    prompt: Text placed under the ``'text'`` key; it is expected to contain
      ``'ANSWER:'`` so that the answer can be located in the decoded output.

  Returns:
    The text after the first ``'ANSWER:'`` marker. When the decoded output
    contains no marker, the full decoded response is returned instead of
    raising ``IndexError``.
  """
  query = tokenizer.from_list_format([
    {'image': img},
    {'text': prompt},
  ])
  inputs = tokenizer(query, return_tensors='pt').to(model.device)
  pred = model.generate(**inputs)

  # Example of a decoded sequence (prompt first, then the continuation; special
  # tokens are kept):
  # <img>https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg</img>Generate the caption in English with grounding:<ref> Woman</ref><box>(451,379),(731,806)</box> and<ref> her dog</ref><box>(219,424),(576,896)</box> playing on the beach<|endoftext|>
  response = tokenizer.decode(pred.cpu()[0], skip_special_tokens=False)

  pieces = response.split('ANSWER:')
  if len(pieces) < 2:
    # Marker missing (e.g. the prompt did not include it): hand back the whole
    # decoded text rather than failing with an opaque IndexError.
    return response
  return pieces[1]
