% Conference paper record for "Evaluation and Assessment as Complementary
% Frameworks" (Antoine, 2026). Lines outside the braces of the entry are
% ignored by BibTeX and serve only as reviewer notes.
% NOTE(review): url targets a preview.aclanthology.org ingest branch, which
%   looks temporary -- confirm the permanent Anthology URL (or a DOI) before
%   relying on this record.
% NOTE(review): address appears to hold the symposium venue rather than the
%   publisher's city -- confirm this is what the target style expects.
@inproceedings{antoine-2026-evaluation,
  author    = {Antoine, Elie},
  title     = {Evaluation and Assessment as Complementary Frameworks},
  editor    = {Mahamood, Saad and
               Howcroft, David M. and
               van Deemter, Kees and
               Balloccu, Simone and
               Sivaprasad, Adarsa and
               Sundararajan, Barkavi and
               Bugar{\'i}n Diz, Alberto and
               Alonso-Moral, Jose Mar{\'i}a},
  booktitle = {Proceedings of the 1st Symposium on Natural Language Generation Evaluations},
  month     = jun,
  year      = {2026},
  address   = {Aberdeen, United Kingdom},
  publisher = {Association for Computational Linguistics},
  pages     = {16--23},
  isbn      = {979-8-89176-436-1},
  url       = {https://preview.aclanthology.org/ingest-retroeval/2026.retroeval-main.3/},
  abstract  = {Language model capabilities have advanced faster than the methods used to evaluate them, particularly since the move from task-specific systems to general-purpose models which are deployed across an ever-widening range of tasks. When models were built for a single task, evaluation sat in a tight relationship between the task, the data, and the model. General-purpose models have weakened this relationship, and the evaluation practices that were built around it have not adjusted. This paper argues that addressing this gap requires treating evaluation, understood as quantitative performance measurement, and assessment, understood as the analysis of mechanisms and real-world behavior, as complementary rather than interchangeable. This distinction matters because evaluation is now often asked to stand alone in settings where a benchmark score cannot tell us what a model is doing, or how its behavior will hold up outside the benchmark.},
}
Markdown (Informal)
[Evaluation and Assessment as Complementary Frameworks](https://preview.aclanthology.org/ingest-retroeval/2026.retroeval-main.3/) (Antoine, RetroEval 2026)
ACL