@inproceedings{shin-etal-2025-video,
title = "Do Video Language Models really understand the video contexts?",
author = "Shin, Jeongwan and
Lim, Jinhyeong and
Park, Hyeyoung",
editor = "Ebrahimi, Abteen and
Haider, Samar and
Liu, Emmy and
Leonor Pacheco, Maria and
Wein, Shira",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 4: Student Research Workshop)",
month = apr,
year = "2025",
address = "Albuquerque, USA",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2025.naacl-srw.40/",
pages = "408--417",
ISBN = "979-8-89176-192-6",
abstract = "This paper examines how well visual language models (VLMs) understand video question answering (VideoQA) tasks and generate responses accordingly. Recently, VLMs based on Large Language Models (LLMs) have shown remarkable performance, but the processes of understanding and reasoning in VLMs remain under-explored. To tackle this challenge, we propose Video Understanding and Response Consistency Assessment, VURCA, a framework that incorporates a fine-grained question generation and answering process to measure how well the responses generated by VLMs align with what the model understands. In addition, we introduce an extended benchmark dataset, FgNExT-QA, which builds upon NExT-QA by incorporating more fine-grained VideoQA tasks. FgNExT-QA is designed to evaluate fine-grained understanding in video question answering. Through experiments, we found that despite the strong overall QA performance of VLMs, their understanding of both the video content and the question remains limited. In particular, they exhibit poor video comprehension in fine-grained VideoQA tasks."
}
Markdown (Informal)
[Do Video Language Models really understand the video contexts?](https://aclanthology.org/2025.naacl-srw.40/) (Shin et al., NAACL 2025)
ACL
- Jeongwan Shin, Jinhyeong Lim, and Hyeyoung Park. 2025. Do Video Language Models really understand the video contexts? In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 4: Student Research Workshop), pages 408–417, Albuquerque, USA. Association for Computational Linguistics.