% Survey paper from the NewSum 2025 workshop proceedings (Anthology ID 2025.newsum-main.11).
% NOTE(review): url points at an Anthology preview/ingest host -- confirm the canonical URL before publishing.
@inproceedings{zhang-2025-bridging,
  title     = {Bridging Multimodal and Video Summarization: A Unified Survey},
  author    = {Zhang, Haopeng},
  editor    = {Dong, Yue and
               Xiao, Wen and
               Zhang, Haopeng and
               Zhang, Rui and
               Ernst, Ori and
               Wang, Lu and
               Liu, Fei},
  booktitle = {Proceedings of The 5th New Frontiers in Summarization Workshop},
  month     = nov,
  year      = {2025},
  address   = {Hybrid},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-emnlp/2025.newsum-main.11/},
  pages     = {157--171},
  isbn      = {979-8-89176-337-1},
  abstract  = {Multimodal summarization (MMS) and video summarization (VS) have traditionally evolved in separate communities{---}natural language processing (NLP) and computer vision (CV), respectively. MMS focuses on generating textual summaries from inputs such as text, images, or audio, while VS emphasizes selecting key visual content. With the recent rise of vision-language models (VLMs), these once-disparate tasks are converging under a unified framework that integrates visual and linguistic understanding. In this survey, we provide a unified perspective that bridges MMS and VS. We formalize the task landscape, review key datasets and evaluation metrics, and categorize major modeling approaches into new taxonomy. In addition, we highlight core challenges and outline future directions toward building general-purpose multimodal summarization systems. By synthesizing insights from both NLP and CV communities, this survey aims to establish a coherent foundation for advancing this rapidly evolving field.},
}
Markdown (Informal)
[Bridging Multimodal and Video Summarization: A Unified Survey](https://preview.aclanthology.org/ingest-emnlp/2025.newsum-main.11/) (Zhang, NewSum 2025)
ACL