@inproceedings{bretti-etal-2026-multimodal,
  title     = {Are Multimodal {LLM}s Movie Buffs?},
  author    = {Bretti, Carlo and
               Mettes, Pascal and
               Van Noord, Nanne},
  editor    = {Demberg, Vera and
               Inui, Kentaro and
               Marquez, Llu{\'\i}s},
  booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-eacl/2026.findings-eacl.139/},
  pages     = {2661--2677},
  isbn      = {979-8-89176-386-9},
  abstract  = {No. While Multimodal Large Language Models (MLLMs) have been shown to perform very well on general video data, we systematically show that their performance on movies lags behind. This is surprising as MLLMs are increasingly used for movie understanding. To measure the performance of MLLMs on movies, we explore three pillars of movie mastery: movie knowledge, cinematographic knowledge, and critical analysis. Through a combination of quantitative and in-depth qualitative evaluations, we identify where MLLMs show promise and, in particular, where they fail. Our findings show that in small-scale settings involving factual knowledge, MLLMs are able to outperform existing methods. However, once cinematographic and critical analysis is required, MLLMs are insufficiently able to extract meaningful information from the visual modality to be able to provide useful insights. The data and project page are available at https://carlobretti.github.io/moviebuff.}
}
Markdown (Informal)
[Are Multimodal LLMs Movie Buffs?](https://preview.aclanthology.org/ingest-eacl/2026.findings-eacl.139/) (Bretti et al., Findings 2026) — ACL
- Carlo Bretti, Pascal Mettes, and Nanne Van Noord. 2026. Are Multimodal LLMs Movie Buffs? In Findings of the Association for Computational Linguistics: EACL 2026, pages 2661–2677, Rabat, Morocco. Association for Computational Linguistics.