@comment{
  ACL 2025 Student Research Workshop paper on DRUM, a demonstration
  retriever for large multi-modal models.
  - url is the canonical ACL Anthology address derived from the Anthology ID
    2025.acl-srw.83, not a preview-branch link.
  - The mixed-case word in the title is brace-protected as a whole word so
    that sentence-casing styles keep its capitalisation.
  - The abstract text is reproduced unchanged, including its original wording.
}
@inproceedings{yi-ge-etal-2025-drum,
  author    = {Yi-Ge, Ellen and
               Gao, Jiechao and
               Han, Wei and
               Zhu, Wei},
  title     = {{DRUM}: Learning Demonstration Retriever for Large {MUlti-modal} Models},
  editor    = {Zhao, Jin and
               Wang, Mingyang and
               Liu, Zhu},
  booktitle = {Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  pages     = {1051--1063},
  isbn      = {979-8-89176-254-1},
  url       = {https://aclanthology.org/2025.acl-srw.83/},
  abstract  = {Recently, large language models (LLMs) have demonstrated impressive capabilities in dealing with new tasks with the help of in-context learning (ICL). In the study of Large Vision-Language Models (LVLMs), when implementing ICL, researchers usually adopt the naive strategies like fixed demonstrations across different samples, or selecting demonstrations directly via a visual-language embedding model. These methods do not guarantee the configured demonstrations fit the need of the LVLMs. To address this issue, we propose a novel framework, demonstration retriever for large multi-modal model (DRUM), which fine-tunes the CLIP embedding model to better meet the LVLM{'}s needs. First, we discuss the retrieval strategies for a visual-language task, assuming an embedding model is given. And we propose to concate the image and text embeddings to enhance the retrieval performance. Second, we propose to re-rank the the embedding model{'}s retrieved demonstrations via the LVLM{'}s feedbacks, and calculate a list-wise ranking loss for training the embedding model. Third, we propose an iterative demonstration mining strategy to improve the training of the embedding model. Through extensive experiments on 3 types of visual-language tasks, 7 benchmark datasets, our DRUM framework is proven to be effective in boosting the LVLM{'}s in-context learning performance via retrieving more proper demonstrations.},
}
Markdown (Informal)
[DRUM: Learning Demonstration Retriever for Large MUlti-modal Models](https://aclanthology.org/2025.acl-srw.83/) (Yi-Ge et al., ACL 2025)
ACL