% Position paper in the GEM workshop proceedings (record taken from an ACL Anthology page).
% NOTE(review): the url field points at a preview.aclanthology.org path -- confirm the
% canonical Anthology URL before relying on this link.
@inproceedings{zaghouani-2026-position,
  author    = {Zaghouani, Wajdi},
  title     = {Position: What Are We Measuring? Rethinking Evaluation in Natural Language Generation},
  editor    = {Mille, Simon and
               Gehrmann, Sebastian and
               Schmidtov{\'a}, Patr{\'i}cia and
               Du{\v{s}}ek, Ond{\v{r}}ej and
               Fadaee, Marzieh and
               Lo, Kyle and
               Santus, Enrico and
               Stanovsky, Gabriel},
  booktitle = {Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {1021--1028},
  isbn      = {979-8-89176-423-1},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.gem-main.79/},
  abstract  = {The field of natural language generation has accumulated a rich ecosystem of automatic evaluation metrics, yet it lacks a coherent theory of what those metrics are actually measuring. Drawing on measurement theory from the quantitative social sciences, this paper argues that current NLG evaluation practices suffer from a fundamental construct validity problem: metrics are treated as proxies for output quality without explicit specification of the underlying constructs they are meant to operationalize. We examine four dominant evaluation paradigms (reference-based metrics, embedding-based metrics, LLM-as-judge, and human evaluation) and demonstrate that each conflates construct definition with operationalization. Building on a long psychometric tradition reaching back to Cronbach and Meehl (1955) and on recent NLP work that has begun to apply this tradition to bias measurement, dialogue evaluation, and benchmark design, we propose that the field adopt a measurement modeling perspective for NLG evaluation. We borrow the concepts of construct validity, reliability, and consequential validity as a foundation for more principled evaluation, and we outline a preliminary taxonomy of NLG quality constructs as a starting point for this work.},
}
Markdown (Informal)
[Position: What Are We Measuring? Rethinking Evaluation in Natural Language Generation](https://preview.aclanthology.org/ingest-acl-workshops/2026.gem-main.79/) (Zaghouani, GEM 2026)
ACL