@inproceedings{chen-etal-2025-vlm,
title = "{VLM} Is a Strong Reranker: Advancing Multimodal Retrieval-augmented Generation via Knowledge-enhanced Reranking and Noise-injected Training",
author = "Chen, Zhanpeng and
Xu, Chengjin and
Qi, Yiyan and
Jiang, Xuhui and
Guo, Jian",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.findings-emnlp.432/",
doi = "10.18653/v1/2025.findings-emnlp.432",
pages = "8140--8158",
ISBN = "979-8-89176-335-7",
abstract = "Vision-language Models (VLMs) have demonstrated remarkable capabilities in processing and generating content across multiple data modalities. However, a significant drawback of VLMs is their reliance on static training data, leading to outdated information and limited contextual awareness. This static nature hampers their ability to provide accurate and up-to-date responses, particularly in dynamic or rapidly evolving contexts. To address these limitations, we propose RagVL, a novel framework with knowledge-enhanced reranking and noise-injected training. We instruction-tune the VLM with a simple yet effective instruction template to induce its ranking ability and serve it as a reranker to precisely filter the top-k retrieved images. For generation, we inject visual noise during training at the data and token levels to enhance the generator{'}s robustness. Extensive experiments on four datasets verify the effectiveness of our method. Code and models are available at https://anonymous.4open.science/r/RagVL-F694."
}