@comment{AACL-IJCNLP 2022 long paper introducing FineCo. The url field uses the canonical ACL Anthology address (matching the Anthology ID in the DOI) instead of a preview.aclanthology.org branch build.}
@inproceedings{wang-etal-2022-contrastive-video,
  title     = {Contrastive Video-Language Learning with Fine-grained Frame Sampling},
  author    = {Wang, Zixu and
               Zhong, Yujie and
               Miao, Yishu and
               Ma, Lin and
               Specia, Lucia},
  editor    = {He, Yulan and
               Ji, Heng and
               Li, Sujian and
               Liu, Yang and
               Chang, Chua-Hui},
  booktitle = {Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 1: Long Papers)},
  month     = nov,
  year      = {2022},
  address   = {Online only},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.aacl-main.53/},
  doi       = {10.18653/v1/2022.aacl-main.53},
  pages     = {694--705},
  abstract  = {Despite recent progress in video and language representation learning, the weak or sparse correspondence between the two modalities remains a bottleneck in the area. Most video-language models are trained via pair-level loss to predict whether a pair of video and text is aligned. However, even in paired video-text segments, only a subset of the frames are semantically relevant to the corresponding text, with the remainder representing noise; where the ratio of noisy frames is higher for longer videos. We propose FineCo (Fine-grained Contrastive Loss for Frame Sampling), an approach to better learn video and language representations with a fine-grained contrastive objective operating on video frames. It helps distil a video by selecting the frames that are semantically equivalent to the text, improving cross-modal correspondence. Building on the well established VideoCLIP model as a starting point, FineCo achieves state-of-the-art performance on YouCookII, a text-video retrieval benchmark with long videos. FineCo also achieves competitive results on text-video retrieval (MSR-VTT), and video question answering datasets (MSR-VTT QA and MSR-VTT MC) with shorter videos.},
}
Markdown (Informal)
[Contrastive Video-Language Learning with Fine-grained Frame Sampling](https://aclanthology.org/2022.aacl-main.53/) (Wang et al., AACL-IJCNLP 2022)
ACL
- Zixu Wang, Yujie Zhong, Yishu Miao, Lin Ma, and Lucia Specia. 2022. Contrastive Video-Language Learning with Fine-grained Frame Sampling. In Proceedings of the 2nd Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics and the 12th International Joint Conference on Natural Language Processing (Volume 1: Long Papers), pages 694–705, Online only. Association for Computational Linguistics.