% NAACL 2025 long paper, ACL Anthology ID 2025.naacl-long.372.
% The url field uses the canonical Anthology host instead of a preview-branch build.
@inproceedings{dahan-stanovsky-2025-state,
  title     = {The State and Fate of Summarization Datasets: A Survey},
  author    = {Dahan, Noam and
               Stanovsky, Gabriel},
  editor    = {Chiruzzo, Luis and
               Ritter, Alan and
               Wang, Lu},
  booktitle = {Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)},
  month     = apr,
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.naacl-long.372/},
  pages     = {7259--7278},
  isbn      = {979-8-89176-189-6},
  abstract  = {Automatic summarization has consistently attracted attention due to its versatility and wide application in various downstream tasks. Despite its popularity, we find that annotation efforts have largely been disjointed, and have lacked common terminology. Consequently, it is challenging to discover existing resources or identify coherent research directions. To address this, we survey a large body of work spanning 133 datasets in over 100 languages, creating a novel ontology covering sample properties, collection methods and distribution. With this ontology we make key observations, including the lack of accessible high-quality datasets for low-resource languages, and the field{'}s overreliance on the news domain and on automatically collected distant supervision. Finally, we make available a web interface that allows users to interact and explore our ontology and dataset collection, as well as a template for a summarization data card, which can be used to streamline future research into a more coherent body of work.},
}
Markdown (Informal)
[The State and Fate of Summarization Datasets: A Survey](https://aclanthology.org/2025.naacl-long.372/) (Dahan & Stanovsky, NAACL 2025)
ACL
- Noam Dahan and Gabriel Stanovsky. 2025. The State and Fate of Summarization Datasets: A Survey. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 7259–7278, Albuquerque, New Mexico. Association for Computational Linguistics.