@inproceedings{liu-etal-2022-cross-lingual,
  title     = {Cross-Lingual Cross-Modal Consolidation for Effective Multilingual Video Corpus Moment Retrieval},
  author    = {Liu, Jiaheng and
               Yu, Tan and
               Peng, Hanyu and
               Sun, Mingming and
               Li, Ping},
  editor    = {Carpuat, Marine and
               de Marneffe, Marie-Catherine and
               Meza Ruiz, Ivan Vladimir},
  booktitle = {Findings of the Association for Computational Linguistics: {NAACL} 2022},
  month     = jul,
  year      = {2022},
  address   = {Seattle, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.findings-naacl.142/},
  doi       = {10.18653/v1/2022.findings-naacl.142},
  pages     = {1854--1862},
  abstract  = {Existing multilingual video corpus moment retrieval (mVCMR) methods are mainly based on a two-stream structure. The visual stream utilizes the visual content in the video to estimate the query-visual similarity, and the subtitle stream exploits the query-subtitle similarity. The final query-video similarity ensembles similarities from two streams. In our work, we propose a simple and effective strategy termed as Cross-lingual Cross-modal Consolidation (C3) to improve mVCMR accuracy. We adopt the ensemble similarity as the teacher to guide the training of each stream, leading to a more powerful ensemble similarity. Meanwhile, we use the teacher for a specific language to guide the student for another language to exploit the complementary knowledge across languages. Extensive experiments on mTVR dataset demonstrate the effectiveness of our C3 method.},
}
Markdown (Informal)
[Cross-Lingual Cross-Modal Consolidation for Effective Multilingual Video Corpus Moment Retrieval](https://aclanthology.org/2022.findings-naacl.142/) (Liu et al., Findings of NAACL 2022)
ACL