@inproceedings{sim-etal-2025-vlms,
title = "Can {VLM}s Actually See and Read? A Survey on Modality Collapse in Vision-Language Models",
author = "Sim, Mong Yuan and
Zhang, Wei Emma and
Dai, Xiang and
Fang, Biaoyan",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/landing_page/2025.findings-acl.1256/",
pages = "24452--24470",
ISBN = "979-8-89176-256-5",
    abstract = "Vision-language models (VLMs) integrate textual and visual information, enabling them to process visual inputs and leverage visual information to generate predictions. Such models are in demand for tasks such as visual question answering, image captioning, and visual grounding. However, recent work has found that VLMs often rely heavily on textual information and ignore visual information, yet still achieve competitive performance on vision-language (VL) tasks. This survey reviews modality collapse analysis work to provide insights into the reasons for this unintended behavior. It also reviews probing studies for fine-grained vision-language understanding, presenting current findings on the information encoded in VL representations and highlighting potential directions for future research."
}