% NAACL 2025 (Volume 1: Long Papers) paper record.
% The url field uses the canonical ACL Anthology address rather than a
% temporary preview-branch address, so the link stays valid long term.
@inproceedings{xiao-etal-2025-pay,
  title     = {Pay More Attention to Images: Numerous Images-Oriented Multimodal Summarization},
  author    = {Xiao, Min and
               Zhu, Junnan and
               Zhai, Feifei and
               Zong, Chengqing and
               Zhou, Yu},
  editor    = {Chiruzzo, Luis and
               Ritter, Alan and
               Wang, Lu},
  booktitle = {Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)},
  month     = apr,
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.naacl-long.474/},
  pages     = {9379--9392},
  isbn      = {979-8-89176-189-6},
  abstract  = {Existing multimodal summarization approaches struggle with scenarios involving numerous images as input, leading to a heavy load for readers. Summarizing both the input text and numerous images helps readers quickly grasp the key points of multimodal input. This paper introduces a novel task, Numerous Images-Oriented Multimodal Summarization (NIMMS). To benchmark this task, we first construct the dataset based on a public multimodal summarization dataset. Considering that most existing metrics evaluate summaries from a unimodal perspective, we propose a new Multimodal Information evaluation (M-info) method, measuring the differences between the generated summary and the multimodal input. Finally, we compare various summarization methods on NIMMS and analyze associated challenges. Experimental results have shown that M-info correlates more closely with human judgments than five widely used metrics. Meanwhile, existing models struggle with summarizing numerous images. We hope that this research will shed light on the development of multimodal summarization. Furthermore, our code and dataset will be released to the public.},
}
Markdown (Informal)
[Pay More Attention to Images: Numerous Images-Oriented Multimodal Summarization](https://aclanthology.org/2025.naacl-long.474/) (Xiao et al., NAACL 2025)
ACL
- Min Xiao, Junnan Zhu, Feifei Zhai, Chengqing Zong, and Yu Zhou. 2025. Pay More Attention to Images: Numerous Images-Oriented Multimodal Summarization. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 9379–9392, Albuquerque, New Mexico. Association for Computational Linguistics.