@inproceedings{zhang-etal-2025-pretrained,
title = "Pretrained Image-Text Models are Secretly Video Captioners",
author = "Zhang, Chunhui and
Jian, Yiren and
Ouyang, Zhongyu and
Vosoughi, Soroush",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2025.naacl-short.26/",
pages = "292--305",
ISBN = "979-8-89176-190-2",
abstract = "Developing video captioning models is computationally expensive. The dynamic nature of video also complicates the design of multimodal models that can effectively caption these sequences. However, we find that by using minimal computational resources and without complex modifications to address video dynamics, an image-based model can be repurposed to outperform several specialised video captioning systems. Our adapted model demonstrates top-tier performance on major benchmarks, ranking 2nd on MSR-VTT and MSVD, and 3rd on VATEX. We transform it into a competitive video captioner by post-training a typical image captioning model BLIP-2 with only 6,000 video-text pairs and simply concatenating frames{---}significantly fewer data than other methods, which use 2.5 to 144 million pairs. From a resource optimization perspective, this video captioning study focuses on three fundamental factors: optimizing model scale, maximizing data efficiency, and incorporating reinforcement learning. This extensive study demonstrates that a lightweight, image-based adaptation strategy can rival state-of-the-art video captioning systems, offering a practical solution for low-resource scenarios."
}
Markdown (Informal)
[Pretrained Image-Text Models are Secretly Video Captioners](https://aclanthology.org/2025.naacl-short.26/) (Zhang et al., NAACL 2025)
ACL
- Chunhui Zhang, Yiren Jian, Zhongyu Ouyang, and Soroush Vosoughi. 2025. Pretrained Image-Text Models are Secretly Video Captioners. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers), pages 292–305, Albuquerque, New Mexico. Association for Computational Linguistics.