@inproceedings{liu-etal-2021-progressively,
title = "Progressively Guide to Attend: An Iterative Alignment Framework for Temporal Sentence Grounding",
author = "Liu, Daizong and
Qu, Xiaoye and
Zhou, Pan",
editor = "Moens, Marie-Francine and
Huang, Xuanjing and
Specia, Lucia and
Yih, Scott Wen-tau",
booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2021",
address = "Online and Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2021.emnlp-main.733/",
doi = "10.18653/v1/2021.emnlp-main.733",
pages = "9302--9311",
abstract = "A key solution to temporal sentence grounding (TSG) exists in how to learn effective alignment between vision and language features extracted from an untrimmed video and a sentence description. Existing methods mainly leverage vanilla soft attention to perform the alignment in a single-step process. However, such single-step attention is insufficient in practice, since complicated relations between inter- and intra-modality are usually obtained through multi-step reasoning. In this paper, we propose an Iterative Alignment Network (IA-Net) for TSG task, which iteratively interacts inter- and intra-modal features within multiple steps for more accurate grounding. Specifically, during the iterative reasoning process, we pad multi-modal features with learnable parameters to alleviate the nowhere-to-attend problem of non-matched frame-word pairs, and enhance the basic co-attention mechanism in a parallel manner. To further calibrate the misaligned attention caused by each reasoning step, we also devise a calibration module following each attention module to refine the alignment knowledge. With such iterative alignment scheme, our IA-Net can robustly capture the fine-grained relations between vision and language domains step-by-step for progressively reasoning the temporal boundaries. Extensive experiments conducted on three challenging benchmarks demonstrate that our proposed model performs better than the state-of-the-arts."
}
Markdown (Informal)
[Progressively Guide to Attend: An Iterative Alignment Framework for Temporal Sentence Grounding](https://preview.aclanthology.org/fix-sig-urls/2021.emnlp-main.733/) (Liu et al., EMNLP 2021)
ACL