@inproceedings{belz-etal-2025-standard,
title = "Standard Quality Criteria Derived from Current {NLP} Evaluations for Guiding Evaluation Design and Grounding Comparability and {AI} Compliance Assessments",
author = "Belz, Anya and
Mille, Simon and
Thomson, Craig",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.findings-acl.1370/",
pages = "26685--26715",
ISBN = "979-8-89176-256-5",
abstract = "Research shows that two evaluation experiments reporting results for the same qualitycriterion name (e.g. Fluency) do not necessarily evaluate the same aspect of quality. Notknowing when two evaluations are comparablein this sense means we currently lack the abilityto draw conclusions based on multiple independently conducted evaluations. It is hard to seehow this issue can be fully addressed other thanby the creation of a standard set of quality criterion names and definitions that the evaluationsin use in NLP can be grounded in. Taking a descriptivist approach, the QCET Quality Criteriafor Evaluation Taxonomy derives a standard setof 114 quality criterion names and definitionsfrom three surveys of a combined total of 933evaluation experiments in NLP, and structuresthem into a reference taxonomy. We presentQCET and its uses in (i) establishing comparability of existing evaluations, (ii) guiding thedesign of new evaluations, and (iii) assessingregulation compliance."
}