@inproceedings{schmidtova-etal-2025-real,
title = "Real-World Summarization: When Evaluation Reaches Its Limits",
author = "Schmidtov{\'a}, Patr{\'i}cia and
  Du{\v{s}}ek, Ond{\v{r}}ej and
Mahamood, Saad",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
  Ros{\'e}, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.findings-emnlp.1363/",
doi = "10.18653/v1/2025.findings-emnlp.1363",
pages = "25014--25026",
ISBN = "979-8-89176-335-7",
abstract = "We examine evaluation of faithfulness to input data in the context of hotel highlights{---}brief LLM-generated summaries that capture unique features of accommodations. Through human evaluation campaigns involving categorical error assessment and span-level annotation, we compare traditional metrics, trainable methods, and LLM-as-a-judge approaches. Our findings reveal that simpler metrics like word overlap correlate surprisingly well with human judgments (r=0.63), often outperforming more complex methods when applied to out-of-domain data. We further demonstrate that while LLMs can generate high-quality highlights, they prove unreliable for evaluation as they tend to severely under- or over-annotate. Our analysis of real-world business impacts shows incorrect and non-checkable information pose the greatest risks. We also highlight challenges in crowdsourced evaluations."
}