@inproceedings{belz-etal-2025-standard,
title = "Standard Quality Criteria Derived from Current {NLP} Evaluations for Guiding Evaluation Design and Grounding Comparability and {AI} Compliance Assessments",
author = "Belz, Anya and
Mille, Simon and
Thomson, Craig",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/transition-to-people-yaml/2025.findings-acl.1370/",
doi = "10.18653/v1/2025.findings-acl.1370",
pages = "26685--26715",
ISBN = "979-8-89176-256-5",
abstract = "Research shows that two evaluation experiments reporting results for the same quality criterion name (e.g. Fluency) do not necessarily evaluate the same aspect of quality. Not knowing when two evaluations are comparable in this sense means we currently lack the ability to draw conclusions based on multiple independently conducted evaluations. It is hard to see how this issue can be fully addressed other than by the creation of a standard set of quality criterion names and definitions that the evaluations in use in NLP can be grounded in. Taking a descriptivist approach, the QCET Quality Criteria for Evaluation Taxonomy derives a standard set of 114 quality criterion names and definitions from three surveys of a combined total of 933 evaluation experiments in NLP, and structures them into a reference taxonomy. We present QCET and its uses in (i) establishing comparability of existing evaluations, (ii) guiding the design of new evaluations, and (iii) assessing regulation compliance."
}