@inproceedings{liu-etal-2021-inflate,
title = "Inflate and Shrink:Enriching and Reducing Interactions for Fast Text-Image Retrieval",
author = "Liu, Haoliang and
Yu, Tan and
Li, Ping",
editor = "Moens, Marie-Francine and
Huang, Xuanjing and
Specia, Lucia and
Yih, Scott Wen-tau",
booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2021",
address = "Online and Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2021.emnlp-main.772/",
doi = "10.18653/v1/2021.emnlp-main.772",
pages = "9796--9809",
abstract = "By exploiting the cross-modal attention, cross-BERT methods have achieved state-of-the-art accuracy in cross-modal retrieval. Nevertheless, the heavy text-image interactions in the cross-BERT model are prohibitively slow for large-scale retrieval. Late-interaction methods trade off retrieval accuracy and efficiency by exploiting cross-modal interaction only in the late stage, attaining a satisfactory retrieval speed. In this work, we propose an inflating and shrinking approach to further boost the efficiency and accuracy of late-interaction methods. The inflating operation plugs several codes in the input of the encoder to exploit the text-image interactions more thoroughly for higher retrieval accuracy. Then the shrinking operation gradually reduces the text-image interactions through knowledge distilling for higher efficiency. Through an inflating operation followed by a shrinking operation, both efficiency and accuracy of a late-interaction model are boosted. Systematic experiments on public benchmarks demonstrate the effectiveness of our inflating and shrinking approach."
}
Markdown (Informal)
[Inflate and Shrink: Enriching and Reducing Interactions for Fast Text-Image Retrieval](https://aclanthology.org/2021.emnlp-main.772/) (Liu et al., EMNLP 2021)