@inproceedings{wu-etal-2026-visret,
title = "{V}is{R}et: Visualization Improves Knowledge-Intensive Text-to-Image Retrieval",
author = "Wu, Di and
Wan, Yixin and
Chang, Kai-Wei",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-acl/2026.acl-long.1192/",
pages = "25982--26001",
ISBN = "979-8-89176-390-6",
abstract = "Text-to-image retrieval (T2I retrieval) remains challenging because cross-modal embeddings often behave as bags of concepts, underrepresenting structured visual relationships such as pose and viewpoint. We proposeVisualize-then-Retrieve (VisRet), a retrieval paradigm that mitigates this limitation of cross-modal similarity alignment. VisRet first projects textual queries into the image modality via T2I generation, then performs retrieval within the image modality to bypass the weaknesses of cross-modal retrievers in recognizing subtle visual-spatial features. Across four benchmarks (Visual-RAG, INQUIRE-Rerank, Microsoft COCO, and our new Visual-RAG-ME featuring multi-entity comparisons), VisRet substantially outperforms cross-modal similarity matching and baselines that recast T2I retrieval as text-to-text similarity matching, improving nDCG@30 by 0.125 on average with CLIP as the retriever and by 0.121 with E5-V. For downstream question answering, VisRet increases accuracy on Visual-RAG and Visual-RAG-ME by 3.8{\%} and 15.7{\%} in top-1 retrieval, and by 3.9{\%} and 11.1{\%} in top-10 retrieval. Ablation studies show compatibility with different T2I instruction LLMs, T2I generation models, and downstream LLMs. VisRet provides a simple yet effective perspective for advancing in text-image retrieval. Our code and the new benchmark are publicly available at https://github.com/xiaowu0162/Visualize-then-Retrieve."
}Markdown (Informal)
[VisRet: Visualization Improves Knowledge-Intensive Text-to-Image Retrieval](https://preview.aclanthology.org/ingest-acl/2026.acl-long.1192/) (Wu et al., ACL 2026)
ACL