% li-etal-2024-mvp: Findings of EMNLP 2024 paper.
% Cleanup notes: URL normalized from the temporary "preview.aclanthology.org/fix-sig-urls"
% mirror to the canonical ACL Anthology URL; field delimiters switched from quotes to
% braces (braces nest, quotes do not); missing space after "perceptions." fixed in the
% abstract. Per ACL Anthology convention, `address` holds the conference venue.
@inproceedings{li-etal-2024-mvp,
  title     = {{MVP}-Bench: Can Large Vision-Language Models Conduct Multi-level Visual Perception Like Humans?},
  author    = {Li, Guanzhen and
               Xie, Yuxi and
               Kan, Min-Yen},
  editor    = {Al-Onaizan, Yaser and
               Bansal, Mohit and
               Chen, Yun-Nung},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2024},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.findings-emnlp.789/},
  doi       = {10.18653/v1/2024.findings-emnlp.789},
  pages     = {13505--13527},
  abstract  = {Humans perform visual perception at multiple levels, including low-level object recognition and high-level semantic interpretation such as behavior understanding. Subtle differences in low-level details can lead to substantial changes in high-level perception. For example, substituting the shopping bag held by a person with a gun suggests violent behavior, implying criminal or violent activity. Despite significant advancements in various multimodal tasks, Large Visual Language Models (LVLMs) remain unexplored in their capabilities to conduct such multi-level visual perceptions. To investigate the perception gap between LVLMs and humans, we introduce MVP-Bench, the first visual{--}language benchmark systematically evaluating both low- and high-level visual perception of LVLMs. We construct MVP-Bench across natural and synthetic images to investigate how manipulated content influences model perception. Using MVP-Bench, we diagnose the visual perception of 10 open-source and 2 closed-source LVLMs, showing that high-level perception tasks significantly challenge existing LVLMs. The state-of-the-art GPT-4o only achieves an accuracy of 56{\%} on Yes/No questions, compared with 74{\%} in low-level scenarios. Furthermore, the performance gap between natural and manipulated images indicates that current LVLMs do not generalize in understanding the visual semantics of synthetic images as humans do.},
}
Markdown (Informal)
[MVP-Bench: Can Large Vision-Language Models Conduct Multi-level Visual Perception Like Humans?](https://aclanthology.org/2024.findings-emnlp.789/) (Li et al., Findings 2024)
ACL