@comment{Workshop paper record in ACL Anthology export layout.
  NOTE(review): the url field points at a preview.aclanthology.org ingest build;
  confirm the canonical address before relying on it.}
@inproceedings{catapang-2026-position,
  title     = {Position: Toward a Metric Typology for Language Model Evaluation},
  author    = {Catapang, Jasper Kyle},
  editor    = {Mille, Simon and
               Gehrmann, Sebastian and
               Schmidtov{\'a}, Patr{\'i}cia and
               Du{\v{s}}ek, Ond{\v{r}}ej and
               Fadaee, Marzieh and
               Lo, Kyle and
               Santus, Enrico and
               Stanovsky, Gabriel},
  booktitle = {Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.gem-main.78/},
  pages     = {1015--1020},
  isbn      = {979-8-89176-423-1},
  abstract  = {The critique of scalar benchmark rankings as proxies for model quality is now well-established (Raji et al., 2021; Wallach et al., 2025; Bean et al., 2025; Gehrmann et al., 2021). What the field still lacks is a shared structural vocabulary for comparing, combining, and contextualizing metric design choices. This paper provides that vocabulary: a four-primitive typology{---}representation ($\phi$), comparison ($D$), aggregation ($A$), and context ($C$){---}under which existing metrics (BLEU, BERTScore, nDCG, LLM-as-judge, calibration scores, agentic outcome measures) are explicit parameterizations of a common form. This typology is paired with a measurement{--}decision split: metrics are noisy estimators of latent constructs, and model selection is context-dependent Pareto optimization over construct estimates, not over raw scores. The typology makes implicit metric assumptions comparable and debatable rather than hidden inside a single number.},
}
Markdown (Informal)
[Position: Toward a Metric Typology for Language Model Evaluation](https://preview.aclanthology.org/ingest-acl-workshops/2026.gem-main.78/) (Catapang, GEM 2026)
ACL