@inproceedings{luo-etal-2024-textual,
  title     = {How Does the Textual Information Affect the Retrieval of Multimodal In-Context Learning?},
  author    = {Luo, Yang and
               Zheng, Zangwei and
               Zhu, Zirui and
               You, Yang},
  editor    = {Al-Onaizan, Yaser and
               Bansal, Mohit and
               Chen, Yun-Nung},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.emnlp-main.305/},
  doi       = {10.18653/v1/2024.emnlp-main.305},
  pages     = {5321--5335},
  abstract  = {The increase in parameter size of multimodal large language models (MLLMs) introduces significant capabilities, particularly multimodal in-context learning, where MLLMs enhance task performance without updating pre-trained parameters. However, this effectiveness hinges on the appropriate selection of in-context examples, a process currently biased towards visual data, overlooking textual information. More importantly, the area of supervised retrievers for retrieval of multimodal in-context learning, crucial for optimal in-context example selection, continues to be investigated. Our study provides an in-depth evaluation of the impact of textual information on the unsupervised selection of in-context examples in multimodal contexts, uncovering a notable sensitivity of retriever performance to the employed modalities. Based on the above finding, we introduce a novel supervised MLLM prompt retriever MSIER that leverages a trained retriever based on MLLM's confidence to select examples, which enhances multimodal in-context learning efficiency. This approach is validated through extensive testing across three different tasks, demonstrating the method's effectiveness. Additionally, we investigate the influence of modalities on our supervised retrieval method's training and explore the transferability of the supervised prompt retriever. This exploration paves the way for future advancements, highlighting the potential for refined in-context learning in MLLMs through the strategic use of multimodal data. The public code is available at https://github.com/NUS-HPC-AI-Lab/Multimodal-ICL-Retriever.},
}
Markdown (Informal)
[How Does the Textual Information Affect the Retrieval of Multimodal In-Context Learning?](https://aclanthology.org/2024.emnlp-main.305/) (Luo et al., EMNLP 2024)
ACL