@comment{
  Workshop paper by Bai and Pezzelle, ACL Anthology ID 2025.evalmg-1.3.
  The url field uses the permanent aclanthology.org address for that ID
  rather than a preview-branch build of the site.
}
@inproceedings{bai-pezzelle-2025-feel,
  title     = {If {I} feel smart, {I} will do the right thing: Combining Complementary Multimodal Information in Visual Language Models},
  author    = {Bai, Yuyu and
               Pezzelle, Sandro},
  editor    = {Zhang, Wei Emma and
               Dai, Xiang and
               Elliot, Desmond and
               Fang, Byron and
               Sim, Mongyuan and
               Zhuang, Haojie and
               Chen, Weitong},
  booktitle = {Proceedings of the First Workshop of Evaluation of Multi-Modal Generation},
  month     = jan,
  year      = {2025},
  address   = {Abu Dhabi, UAE},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.evalmg-1.3/},
  pages     = {24--39},
  abstract  = {Generative visual language models (VLMs) have recently shown potential across various downstream language-and-vision tasks. At the same time, it is still an open question whether, and to what extent, these models can properly understand a multimodal context where language and vision provide complementary information{---}a mechanism routinely in place in human language communication. In this work, we test various VLMs on the task of generating action descriptions consistent with both an image{'}s visual content and an intention or attitude (not visually grounded) conveyed by a textual prompt. Our results show that BLIP-2 is not far from human performance when the task is framed as a generative multiple-choice problem, while other models struggle. Furthermore, the actions generated by BLIP-2 in an open-ended generative setting are better than those by the competitors; indeed, human annotators judge most of them as plausible continuations for the multimodal context. Our study reveals substantial variability among VLMs in integrating complementary multimodal information, yet BLIP-2 demonstrates promising trends across most evaluations, paving the way for seamless human-computer interaction.},
}
Markdown (Informal)
[If I feel smart, I will do the right thing: Combining Complementary Multimodal Information in Visual Language Models](https://aclanthology.org/2025.evalmg-1.3/) (Bai & Pezzelle, EvalMG 2025)
ACL