@inproceedings{jung-kim-2025-qeva,
title = "{QEVA}: A Reference-Free Evaluation Metric for Narrative Video Summarization with Multimodal Question Answering",
author = "Jung, Woojun and
Kim, Junyeong",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/name-variant-enfa-fane/2025.findings-emnlp.1340/",
doi = "10.18653/v1/2025.findings-emnlp.1340",
pages = "24632--24642",
ISBN = "979-8-89176-335-7",
abstract = "Video-to-text summarization remains underexplored in terms of comprehensive evaluation methods. Traditional n-gram overlap-based metrics and recent large language model (LLM)-based approaches depend heavily on human-written reference summaries, limiting their practicality and sensitivity to nuanced semantic aspects. In this paper, we propose QEVA, a reference-free metric evaluating candidate summaries directly against source videos through multimodal question answering. QEVA assesses summaries along three clear dimensions: Coverage, Factuality, and Temporal Coherence. We also introduce MLVU(VS)-Eval, a new annotated benchmark derived from the MLVU dataset, comprising 800 summaries generated from 200 videos using state-of-the-art video-language multimodal models. This dataset establishes a transparent and consistent framework for evaluation. Experimental results demonstrate that QEVA shows higher correlation with human judgments compared to existing approaches, as measured by Kendall{'}s $\tau_b$, $\tau_c$, and Spearman{'}s $\rho$. We hope that our benchmark and metric will facilitate meaningful progress in video-to-text summarization research and provide valuable insights for the development of future evaluation methods."
}