@inproceedings{adhikari-lapata-2025-debating,
title = "Debating for Better Reasoning in Vision-Language Models",
author = "Adhikari, Ashutosh and
Lapata, Mirella",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Ros{\'e}, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.findings-emnlp.853/",
doi = "10.18653/v1/2025.findings-emnlp.853",
pages = "15766--15784",
ISBN = "979-8-89176-335-7",
abstract = "As Large Language Models (LLMs) gain expertise across diverse domains and modalities, scalable oversight becomes increasingly challenging, particularly when their capabilities may surpass human evaluators. Debate has emerged as a promising mechanism for enabling such oversight. We extend the debate paradigm to a multimodal setting, exploring its potential for blind models to supervise and enhance the performance of sighted ones. We focus on visual question answering (VQA), where two ``sighted'' expert vision-language models debate an answer, while a ``blind'' (text-only) judge adjudicates based solely on the quality of the arguments. In our framework, the experts only defend answers aligned with their beliefs, thereby obviating the need for explicit role-playing and concentrating the debate on instances of expert disagreement. Experiments on several multimodal tasks demonstrate that the debate framework consistently outperforms individual expert models. Moreover, judgments from blind LLMs can be used to instil reasoning capabilities in vision-language models through fine-tuning."
}