@inproceedings{you-etal-2025-cross,
title = "Cross-modal Clustering-based Retrieval for Scalable and Robust Image Captioning",
author = "You, Jingyi and
Sasaki, Hiroshi and
Kadowaki, Kazuma",
editor = "Kriz, Reno and
Murray, Kenton",
booktitle = "Proceedings of the 1st Workshop on Multimodal Augmented Generation via Multimodal Retrieval (MAGMaR 2025)",
month = aug,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/landing_page/2025.magmar-1.4/",
pages = "47--58",
ISBN = "979-8-89176-280-0",
abstract = "Recent advances in retrieval-augmented generative image captioning (RAG-IC) have significantly improved caption quality by incorporating external knowledge and similar examples into language model-driven caption generators. However, these methods still encounter challenges when applied to real-world scenarios. First, many existing approaches rely on bimodal retrieval datastores that require large amounts of labeled data and substantial manual effort to construct, making them costly and time-consuming. Moreover, they simply retrieve the nearest samples to the input query from datastores, which leads to high redundancy in the retrieved content and subsequently degrades the quality of the generated captions. In this paper, we introduce a novel RAG-IC approach named \textit{ \textbf{C}r\textbf{o}ss-modal \textbf{Di}versity-promoting \textbf{Ret}rieval technique} (CoDiRet), which integrates a text-only unimodal retrieval module with our unique cluster-based retrieval mechanism. This proposal simultaneously enhances the scalability of the datastore, promotes diversity in retrieved content, and improves robustness against out-of-domain inputs, which eventually facilitates real-world applications. Experimental results demonstrate that our method, despite being exclusively trained on the COCO benchmark dataset, achieves competitive performance on the in-domain benchmark and generalizes robustly across different domains without additional training."
}