@article{sterz-etal-2025-dare,
    title         = "{DARE}: Diverse Visual Question Answering with Robustness Evaluation",
    author        = "Sterz, Hannah and
      Pfeiffer, Jonas and
      Vuli{\'c}, Ivan",
    journal       = "Transactions of the Association for Computational Linguistics",
    volume        = "13",
    pages         = "1121--1145",
    year          = "2025",
    address       = "Cambridge, MA",
    publisher     = "MIT Press",
    url           = "https://aclanthology.org/2025.tacl-1.52/",
    doi           = "10.1162/tacl.a.29",
    internal-note = "NOTE(review): doi does not match the usual TACL pattern 10.1162/tacl_a_NNNNN -- verify against the publisher record",
    abstract      = "Vision Language Models (VLMs) extend remarkable capabilities of text-only large language models and vision-only models, being able to learn from and process multi-modal vision-text input. While modern VLMs perform well on a number of standard image classification and image-text matching tasks, they still struggle with a number of crucial vision-language (VL) reasoning abilities such as counting and spatial reasoning. Moreover, while they might be very brittle to small variations in instructions and/or evaluation protocols, existing benchmarks fail to evaluate their robustness (or rather the lack of it). In order to couple challenging VL scenarios with comprehensive robustness evaluation, we introduce DARE, Diverse Visual Question Answering with Robustness Evaluation, a carefully created and curated multiple-choice VQA benchmark. DARE evaluates VLM performance on five diverse categories and includes four robustness-oriented evaluations based on the variations of prompts, the subsets of answer options, the output format, and the number of correct answers. Among a spectrum of other findings, we report that state-of-the-art VLMs still struggle with questions in most categories and are unable to consistently deliver their peak performance across the tested robustness evaluations. Consequently, our work calls for the systematic addition of robustness evaluations in future VLM research."
}
Markdown (Informal)
[DARE: Diverse Visual Question Answering with Robustness Evaluation](https://aclanthology.org/2025.tacl-1.52/) (Sterz et al., TACL 2025)
ACL