@comment{Entry for the GEM 2026 workshop paper by Zhou.
  NOTE(review): the url field points at a preview.aclanthology.org ingest
  path -- confirm the canonical URL before relying on it.}
@inproceedings{zhou-2026-position,
  title     = {Position: Scores Without Context? Rethinking the Role of Evaluation in the Era of {LLM}s},
  author    = {Zhou, Jiawei},
  editor    = {Mille, Simon and
               Gehrmann, Sebastian and
               Schmidtov{\'a}, Patr{\'i}cia and
               Du{\v{s}}ek, Ond{\v{r}}ej and
               Fadaee, Marzieh and
               Lo, Kyle and
               Santus, Enrico and
               Stanovsky, Gabriel},
  booktitle = {Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.gem-main.82/},
  pages     = {1048--1054},
  isbn      = {979-8-89176-423-1},
  abstract  = {Recent years have seen rapid growth in evaluation and benchmarking in NLP, driven by advances in large language models (LLMs). This growth has shifted evaluation from measuring generalization to tracking capability, often without reference to training assumptions. We argue that this creates a conceptual gap: results are frequently interpreted without considering what models could plausibly have learned, rendering many conclusions scientifically underdetermined. We propose an expectation-aware view, where the informativeness of evaluation depends on its relationship to training data, model design, and tasks. We further distinguish between evaluation for scientific understanding and capability tracking, and provide recommendations for aligning evaluation with its intended purpose in the LLM era.},
}
Markdown (Informal)
[Position: Scores Without Context? Rethinking the Role of Evaluation in the Era of LLMs](https://preview.aclanthology.org/ingest-acl-workshops/2026.gem-main.82/) (Zhou, GEM 2026)
ACL