@comment{Conference paper in the LREC 2026 main proceedings. Typed as inproceedings so that booktitle, editor, publisher and address are used by standard styles. The non-numeric proceedings part label is kept in the style-ignored field "track" rather than in "volume".}
@inproceedings{albatarni-elsayed-2026-quadratic,
    title = "Quadratic Weighted Kappa Is Not Enough for Evaluating Automated Essay Scoring Models",
    author = "Albatarni, Salam and
      Elsayed, Tamer",
    editor = "Piperidis, Stelios and
      Bel, N{\'u}ria and
      van den Heuvel, Henk and
      Ide, Nancy and
      Krek, Simon and
      Toral, Antonio",
    booktitle = "International Conference on Language Resources and Evaluation",
    track = "main",
    month = may,
    year = "2026",
    address = "Palma de Mallorca, Spain",
    publisher = "ELRA Language Resource Association",
    url = "https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.348/",
    pages = "4447--4456",
    abstract = "Quadratic Weighted Kappa (QWK) has been the standard evaluation metric in Automated Essay Scoring (AES) research for over two decades. Despite repeated criticisms highlighting its limitations, the community has largely continued to rely on QWK without adopting alternative metrics. This study aims to encourage a shift toward more suitable evaluation practices by systematically examining QWK{'}s behavior under three key conditions: dataset size, class imbalance, and score range. Using both a publicly available AES dataset and carefully synthesized datasets, we demonstrate scenarios where QWK produces unstable or misleading results. Our findings highlight the need for more robust evaluation practices and point to alternative metrics, particularly variants of Gwet{'}s AC2, that offer greater reliability across a variety of conditions."
}
Markdown (Informal)
[Quadratic Weighted Kappa Is Not Enough for Evaluating Automated Essay Scoring Models](https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.348/) (Albatarni & Elsayed, LREC 2026)
ACL