% CF-TCIR paper by Yang, Wang, and Wang, published in Findings of ACL 2024.
% The url field uses the canonical aclanthology.org address rather than a
% temporary preview-branch host; the doi field is the preferred stable identifier.
% NOTE(review): address appears to hold the conference venue (as exported by the
% ACL Anthology), not the publisher's city -- confirm before relying on it.
@inproceedings{yang-etal-2024-cf,
  author    = {Yang, Yuchen and
               Wang, Yu and
               Wang, Yanfeng},
  title     = {{CF}-{TCIR}: A Compositor-Free Framework for Hierarchical Text-Conditioned Image Retrieval},
  editor    = {Ku, Lun-Wei and
               Martins, Andre and
               Srikumar, Vivek},
  booktitle = {Findings of the Association for Computational Linguistics: {ACL} 2024},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  pages     = {16315--16325},
  doi       = {10.18653/v1/2024.findings-acl.965},
  url       = {https://aclanthology.org/2024.findings-acl.965/},
  abstract  = {In text-conditioned image retrieval (TCIR), the combination of a reference image and modification text forms a query tuple, aiming to locate the most congruent target image within a dataset. The advantages of rich image semantic information and text flexibility are combined in this manner for more accurate retrieval. While traditional techniques often employ attention-driven compositors to craft a unified image-text representation, our paper introduces a compositor-free framework, CF-TCIR, which eschews the standard compositor. Compositor-based methods are designed to learn a joint representation of images and text, but they struggle to directly capture the correlations between attributes across the image and text modalities. Instead, we reformulate the retrieval process as a cross-modal interaction between a synthesized image feature and its corresponding text descriptor. This novel methodology offers advantages in terms of computational efficiency, scalability, and superior performance. To optimize the retrieval performance, we advocate a tiered retrieval mechanism, blending both coarse-grain and fine-grain paradigms. Moreover, to enrich the contextual relationship within the query tuple, we integrate a generative cross-modal alignment technique, ensuring synchronization of sequential attributes between image and text data.},
}
Markdown (Informal)
[CF-TCIR: A Compositor-Free Framework for Hierarchical Text-Conditioned Image Retrieval](https://aclanthology.org/2024.findings-acl.965/) (Yang et al., Findings 2024)
ACL