% Conference paper entry; the containing volume is the Findings of EACL 2026 (see booktitle).
% NOTE(review): url points at a preview.aclanthology.org ingest path -- presumably a
% staging location; confirm the canonical URL before relying on it.
@inproceedings{dahan-etal-2026-leveraging,
  title     = {Leveraging Digitized Newspapers to Collect Summarization Data in Low-Resource Languages},
  author    = {Dahan, Noam and
               Kidron, Omer and
               Stanovsky, Gabriel},
  editor    = {Demberg, Vera and
               Inui, Kentaro and
               Marquez, Llu{\'i}s},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {EACL} 2026},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-eacl/2026.findings-eacl.278/},
  pages     = {5260--5273},
  isbn      = {979-8-89176-386-9},
  abstract  = {High quality summarization data remains scarce in under-represented languages. However, historical newspapers, made available through recent digitization efforts, offer an abundant source of untapped, naturally annotated data. In this work, we present a novel method for collecting naturally occurring summaries via Front-Page Teasers, where editors summarize full length articles. We show that this phenomenon is common across seven diverse languages and supports multi-document summarization. To scale data collection, we develop an automatic process, suited to varying linguistic resource levels. Finally, we apply this process to a Hebrew newspaper title, producing HEBTEASESUM, the first dedicated multi-document summarization dataset in Hebrew.},
}

Markdown (Informal)
[Leveraging Digitized Newspapers to Collect Summarization Data in Low-Resource Languages](https://preview.aclanthology.org/ingest-eacl/2026.findings-eacl.278/) (Dahan et al., Findings 2026)
ACL