% Conference-paper record; field values are kept verbatim from the exported entry.
% NOTE(review): the url field points at a preview.aclanthology.org ingest branch,
% which looks like a staging address. Confirm the canonical URL before relying on it.
@inproceedings{schmidtova-etal-2026-never,
  title     = {Never Truly Out of Fashion: A Retrospective Look at Evaluation in {NLG}},
  author    = {Schmidtov{\'a}, Patr{\'i}cia and
               Mahamood, Saad and
               Du{\v{s}}ek, Ond{\v{r}}ej},
  editor    = {Mahamood, Saad and
               Howcroft, David M. and
               van Deemter, Kees and
               Balloccu, Simone and
               Sivaprasad, Adarsa and
               Sundararajan, Barkavi and
               Bugar{\'i}n Diz, Alberto and
               Alonso-Moral, Jose Mar{\'i}a},
  booktitle = {Proceedings of the 1st Symposium on Natural Language Generation Evaluations},
  month     = jun,
  year      = {2026},
  address   = {Aberdeen, United Kingdom},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-retroeval/2026.retroeval-main.8/},
  pages     = {63--72},
  isbn      = {979-8-89176-436-1},
  abstract  = {Human evaluation (HE) remains the gold standard for assessing natural language generation (NLG) systems, yet automatic metrics are cheaper and faster, creating mounting pressure to skip it. We ask how evaluation practices have changed as NLG research scales. We analyse 24,291 papers from the ACL Anthology (1952{--}2025) through regular-expression-powered keyword analysis. Before 1990, the majority of NLG papers reported no evaluation at all; today, evaluation is near-universal and HE has held broadly stable over the past decade {--} it has not collapsed. However, large language model (LLM) judges (referred to as LLM-as-a-judge) have emerged rapidly since 2023, and while they currently serve predominantly as a complement rather than a full substitute for human evaluation, a substantial share of papers already use LLM judges without any human validation. Faithfulness has become the fastest-rising evaluation criterion since 2020, coming back into fashion after almost 15 years of decline, tracking the prominence of hallucination research, while criteria such as grammaticality and fluency are receding, suggesting these qualities may increasingly be taken for granted as model outputs improve. Our findings provide a longitudinal baseline for tracking where the field stands.},
}
Markdown (Informal)
[Never Truly Out of Fashion: A Retrospective Look at Evaluation in NLG](https://preview.aclanthology.org/ingest-retroeval/2026.retroeval-main.8/) (Schmidtová et al., RetroEval 2026)
ACL