@inproceedings{belz-etal-2026-mapping,
    title = "Mapping Out the {NLP} Evaluation Landscape with a Standard Taxonomy of Quality Criteria",
    author = "Belz, Anya and
      Mille, Simon and
      Thomson, Craig",
    editor = "Mille, Simon and
      Gehrmann, Sebastian and
      Schmidtov{\'a}, Patr{\'i}cia and
      Du{\v{s}}ek, Ond{\v{r}}ej and
      Fadaee, Marzieh and
      Lo, Kyle and
      Santus, Enrico and
      Stanovsky, Gabriel",
    booktitle = "Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})",
    month = jul,
    year = "2026",
    address = "San Diego, California, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/ingest-acl-workshops/2026.gem-main.77/",
    pages = "999--1014",
    isbn = "979-8-89176-423-1",
    abstract = "Prior research shows that when papers report results from system evaluations in terms of a quality criterion such as Fluency, answers to two questions are normally less clear than they should be: (i) was it really Fluency that was evaluated; and (ii) was the same aspect of quality evaluated as in other evaluations also claiming to evaluate Fluency. Answers to these questions are crucial if meaningful conclusions about the Fluency of systems, independently and as compared to others, are to be drawn. We map a combined total of 1,002 individual evaluations identified in three surveys of 310 NLP papers to the standardised QCET inventory of quality criterion names and definitions. Standardisation results in up to 76{\%} reduction in evaluation criteria names, revealing a lot of spurious difference in evaluation naming. We argue that conclusions drawn from NLP system evaluations are only fully interpretable and comparable if grounding in a standard inventory of quality criterion names and definitions forms part of experiment design and reporting, and we propose a way of achieving this."
}
Markdown (Informal)
[Mapping Out the NLP Evaluation Landscape with a Standard Taxonomy of Quality Criteria](https://preview.aclanthology.org/ingest-acl-workshops/2026.gem-main.77/) (Belz et al., GEM 2026)
ACL