% Paper in the proceedings of the Fifth GEM workshop (publisher field:
% Association for Computational Linguistics).
% NOTE(review): the url field points at a preview.aclanthology.org ingest
% branch, which looks like a staging address -- confirm, and swap in the
% permanent URL (or add a doi field) once one is available.
% NOTE(review): address holds the conference venue as exported; some styles
% expect the publisher's city here -- confirm against the target style.
@inproceedings{gilda-gilda-2026-position,
  author    = {Gilda, Sankalp and Gilda, Shlok},
  title     = {Position: Evaluation Scores Are Perishable Knowledge Claims},
  editor    = {Mille, Simon and
               Gehrmann, Sebastian and
               Schmidtov{\'a}, Patr{\'i}cia and
               Du{\v{s}}ek, Ond{\v{r}}ej and
               Fadaee, Marzieh and
               Lo, Kyle and
               Santus, Enrico and
               Stanovsky, Gabriel},
  booktitle = {Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {1029--1035},
  isbn      = {979-8-89176-423-1},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.gem-main.80/},
  abstract  = {Evaluation methodologies for language models increasingly combine multiple signals{---}automated metrics, LLM-as-judge ratings, human assessments, and benchmark suite results. When these signals are aggregated via averaging, the resulting evaluation confidence can substantially exceed the reliability of the weakest signal: a phenomenon we call trust inflation in evaluation. We argue that evaluation scores should be treated as epistemic claims with three properties: formality (human evaluation provides stronger evidence than an automated metric), scope (a benchmark result applies to the tested distribution, not universally), and \textit{validity windows} (benchmark results expire as contamination accumulates and distributions shift). Drawing on several converging research traditions{---}chain-of-thought analysis, possibilistic logic, and algebraic theory{---}that establish weakest-link aggregation as the conservative endpoint of a parameterized operator family controlled by a single pessimism parameter, and on concrete lessons from building an evaluation harness for agentic AI, we propose that evaluation results carry explicit metadata{---}formality tier, scope declaration, and expiration date{---}to make their epistemic status transparent. We illustrate the cost of mean aggregation on the public HELM leaderboard: across 54 frontier models on ten scenarios, the top-five models ranked by mean score and by weakest-link are completely disjoint.},
}
Markdown (Informal)
[Position: Evaluation Scores Are Perishable Knowledge Claims](https://preview.aclanthology.org/ingest-acl-workshops/2026.gem-main.80/) (Gilda & Gilda, GEM 2026)
ACL