@inproceedings{yu-etal-2022-ghan,
title = "{GHAN}: Graph-Based Hierarchical Aggregation Network for Text-Video Retrieval",
author = "Yu, Yahan and
Hu, Bojie and
Li, Yu",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2022.emnlp-main.374/",
doi = "10.18653/v1/2022.emnlp-main.374",
pages = "5547--5557",
abstract = "Text-video retrieval focuses on two aspects: cross-modality interaction and video-language encoding. Currently, the mainstream approach is to train a joint embedding space for multimodal interactions. However, there are structural and semantic differences between text and video, making this approach challenging for fine-grained understanding. In order to solve this, we propose an end-to-end graph-based hierarchical aggregation network for text-video retrieval according to the hierarchy possessed by text and video. We design a token-level weighted network to refine intra-modality representations and construct a graph-based message passing attention network for global-local alignment across modality. We conduct experiments on the public datasets MSR-VTT-9K, MSR-VTT-7K and MSVD, and achieve Recall@1 of 73.0{\%}, 65.6{\%}, and 64.0{\%} , which is 25.7{\%}, 16.5{\%}, and 14.2{\%} better than the current state-of-the-art model."
}
Markdown (Informal)
[GHAN: Graph-Based Hierarchical Aggregation Network for Text-Video Retrieval](https://aclanthology.org/2022.emnlp-main.374/) (Yu et al., EMNLP 2022)
ACL
Yahan Yu, Bojie Hu, and Yu Li. 2022. GHAN: Graph-Based Hierarchical Aggregation Network for Text-Video Retrieval. In Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing, pages 5547–5557, Abu Dhabi, United Arab Emirates. Association for Computational Linguistics.
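
The abstract above describes a token-level refinement step followed by graph-based message-passing attention for global-local cross-modal alignment. Below is a minimal illustrative sketch of one round of such message passing in PyTorch; the class name, tensor shapes, and single-round bipartite attention are assumptions made for illustration, not the authors' released GHAN implementation.

```python
# Illustrative sketch only: generic cross-modal message-passing attention
# in the spirit of GHAN's global-local alignment. All names and shapes
# are hypothetical, not taken from the paper's code.
import torch
import torch.nn as nn
import torch.nn.functional as F

class CrossModalMessagePassing(nn.Module):
    """One round of attention-based message passing between text-token
    nodes and video-frame nodes of a bipartite graph (hypothetical)."""
    def __init__(self, dim: int):
        super().__init__()
        self.q = nn.Linear(dim, dim)   # queries from the text side
        self.k = nn.Linear(dim, dim)   # keys from the video side
        self.v = nn.Linear(dim, dim)   # values carried by messages
        self.norm = nn.LayerNorm(dim)

    def forward(self, text_nodes: torch.Tensor, video_nodes: torch.Tensor):
        # text_nodes:  (B, T, D) token-level features
        # video_nodes: (B, F, D) frame-level features
        attn = torch.softmax(
            self.q(text_nodes) @ self.k(video_nodes).transpose(1, 2)
            / text_nodes.size(-1) ** 0.5,
            dim=-1,
        )                                         # (B, T, F) edge weights
        messages = attn @ self.v(video_nodes)     # aggregate video -> text
        return self.norm(text_nodes + messages)   # residual node update

# Usage: refine token features with frame-level messages, then score
# retrieval by cosine similarity of mean-pooled representations.
layer = CrossModalMessagePassing(dim=512)
text = torch.randn(2, 12, 512)    # batch of 2 captions, 12 tokens each
video = torch.randn(2, 30, 512)   # batch of 2 clips, 30 frames each
text_refined = layer(text, video)
score = F.cosine_similarity(text_refined.mean(1), video.mean(1))
```

GHAN's hierarchical aggregation presumably stacks such updates across multiple granularity levels of the text and video hierarchies; this sketch shows only a single token-to-frame attention round.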