@inproceedings{ge-etal-2025-mrfd,
  title     = {{MRFD}: Multi-Region Fusion Decoding with Self-Consistency for Mitigating Hallucinations in {LVLM}s},
  author    = {Ge, Haonan and
               Wang, Yiwei and
               Yang, Ming-Hsuan and
               Cai, Yujun},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2025},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-emnlp.858/},
  doi       = {10.18653/v1/2025.findings-emnlp.858},
  pages     = {15860--15879},
  isbn      = {979-8-89176-335-7},
  abstract  = {Large Vision-Language Models (LVLMs) have shown strong performance across multimodal tasks. However, they often produce hallucinations{---}text that is inconsistent with visual input, due to the limited ability to verify information in different regions of the image. To address this, we propose Multi-Region Fusion Decoding (MRFD), a training-free decoding method that improves factual grounding by modeling inter-region consistency. MRFD identifies salient regions using cross-attention, generates initial responses for each, and computes reliability weights based on Jensen-Shannon Divergence (JSD) among the responses. These weights guide a consistency-aware fusion of per-region predictions, using region-aware prompts inspired by Chain-of-Thought reasoning. Experiments across multiple LVLMs and benchmarks show that MRFD significantly reduces hallucinations and improves response factuality without requiring model updates.},
}

@comment{
  Markdown (Informal) citation text from the Anthology page export, kept for reference:
  [MRFD: Multi-Region Fusion Decoding with Self-Consistency for Mitigating Hallucinations in LVLMs](https://aclanthology.org/2025.findings-emnlp.858/) (Ge et al., Findings 2025)
  ACL
}