% Workshop paper entry (EvalEval 2026). The url field points at the Anthology preview/ingest host.
@inproceedings{trott-parkinson-coombs-2026-graduating,
  title     = {Graduating the Benchmark Scale: Lessons from Thermometry},
  author    = {Trott, Sean and
               Parkinson-Coombs, Ois{\'i}n},
  editor    = {Akhtar, Mubashara and
               Batzner, Jan and
               Choshen, Leshem and
               Ghosh, Avijit and
               Gohar, Usman and
               Mickel, Jennifer and
               Pant, Ichhya and
               Talat, Zeerak and
               Lin, Michelle},
  booktitle = {Proceedings of the Workshop on Evaluating Evaluations ({E}val{E}val)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, CA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.evaleval-1.21/},
  pages     = {111--115},
  isbn      = {979-8-89176-429-3},
  abstract  = {Benchmarks for assessing large language model (LLM) capabilities have been criticized for a lack of construct validity. Here, we focus on an often overlooked dimension of a benchmark{'}s validity: namely, the functional mapping between a benchmark{'}s numerical score and the underlying quantity the benchmark purports to measure. What licenses the assumption that equivalent intervals on a scale correspond to equivalent differences in the underlying capability? We argue that this question is not merely theoretical: the form of this mapping (e.g., linear vs. logarithmic vs. exponential) could and should influence decisions about deployment and regulatory policy. Drawing on work from the history and philosophy of science, we discuss an analogous problem in the early history of thermometry termed the problem of nomic measurement, as well as the epistemic practices that enabled scientists to overcome these challenges. We then ask whether a similar process of epistemic iteration can overcome this problem in benchmarking. Despite clear differences between temperature and ``capabilities'' as constructs, we argue that some modest success could be achievable in the domain of benchmarking{---}but that this depends crucially on the clear articulation of a researcher{'}s goals and theoretical commitments.}
}
Markdown (Informal)
[Graduating the Benchmark Scale: Lessons from Thermometry](https://preview.aclanthology.org/ingest-acl-workshops/2026.evaleval-1.21/) (Trott & Parkinson-Coombs, EvalEval 2026)
ACL