% Workshop paper record for the MAGMaR 2026 proceedings (publisher: Association for Computational Linguistics).
% NOTE(review): the url field targets the preview.aclanthology.org ingest branch -- confirm the canonical address before citing.
% NOTE(review): address holds "San Diego, USA"; presumably the workshop venue rather than the publisher's city -- verify against the target style.
@inproceedings{catapang-2026-image,
  title     = {When Image and Text Disagree: Cross-Modal Evidence Conflict in Multimodal Retrieval-Augmented Generation},
  author    = {Catapang, Jasper Kyle},
  editor    = {Murray, Kenton and Kriz, Reno},
  booktitle = {Proceedings of the 2nd Workshop on Multimodal Augmented Generation via Multimodal Retrieval ({MAGMaR} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.magmar-main.3/},
  pages     = {1--10},
  isbn      = {979-8-89176-425-5},
  abstract  = {This paper introduces the Cross-Modal Conflict Benchmark (CMC-Bench) to evaluate how multimodal retrieval-augmented generation (RAG) systems handle contradicting evidence between retrieved text and images. Using 3,768 instances from ChartQA and MMMU evaluation splits, the study benchmarks four open vision-language models (VLMs) across four conflict types (factual, temporal, entity, and granularity) and four evidence conditions: aligned (both modalities support the gold answer), image-correct (image supports the gold and text contradicts it), text-correct (text supports the gold and the image is wrong or swapped), and both-wrong (neither modality supports the gold). Key findings reveal that cross-modal disagreement severely degrades performance, with change in accuracy between 0.17 and 0.46 relative to aligned evidence. Results show models often exhibit a modality lean rather than reliable arbitration, with text-leaning systems particularly vulnerable when only the image is correct. Furthermore, merging abstention and fabrication into a single hallucination score obscures critical behavioral differences; for instance, Qwen3-VL-4B abstains on 31.7{\%} of conflicts, while Gemma-3n-E2B fabricates unsupported answers in 51.9{\%} of conflicts. Multimodal RAG evaluation should explicitly distinguish abstention from fabrication to assess reliability accurately.},
}
Markdown (Informal)
[When Image and Text Disagree: Cross-Modal Evidence Conflict in Multimodal Retrieval-Augmented Generation](https://preview.aclanthology.org/ingest-acl-workshops/2026.magmar-main.3/) (Catapang, MAGMaR 2026)
ACL