% Conference paper from the Findings of ACL: EMNLP 2025 volume.
% The url field uses the permanent ACL Anthology address (same identifier as
% the DOI suffix) rather than a temporary preview-branch address.
@inproceedings{liu-zhou-2025-bridging,
  title     = {Bridging Semantic and Modality Gaps in Zero-Shot Captioning via Retrieval from Synthetic Data},
  author    = {Liu, Zhiyue and
               Zhou, Wenkai},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2025},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-emnlp.754/},
  doi       = {10.18653/v1/2025.findings-emnlp.754},
  pages     = {14010--14023},
  isbn      = {979-8-89176-335-7},
  abstract  = {Zero-shot image captioning, which aims to generate image descriptions without relying on annotated data, has recently attracted increasing research interest. Pre-trained text-to-image generation models enable the creation of synthetic pairs solely from text data, while existing methods fall short in mitigating the discrepancy caused by the inability of synthetic images to fully capture the semantics of the textual input, resulting in unreliable cross-modal correspondences. To address this, we propose a retrieval-based framework that leverages only existing synthetic image-text pairs as its search corpus to systematically bridge the gap when using synthetic data for captioning. For the semantic gap between a synthetic image and its input text, our framework retrieves supplementary visual features from similar synthetic examples and integrates them to refine the image embedding. Then, it extracts image-related textual descriptions to mitigate the modality gap during decoding. Moreover, we introduce a plug-and-play visual semantic module that detects visual entities, further facilitating the construction of semantic correspondences between images and text. Experimental results on benchmark datasets demonstrate that our method obtains state-of-the-art results.},
}
Markdown (Informal)
[Bridging Semantic and Modality Gaps in Zero-Shot Captioning via Retrieval from Synthetic Data](https://aclanthology.org/2025.findings-emnlp.754/) (Liu & Zhou, Findings 2025)
ACL