@comment{Workshop paper in the BEA 2026 proceedings; field values kept as exported.
  NOTE(review): the url field targets the preview.aclanthology.org
  "ingest-acl-workshops" path, which looks like a staging location --
  confirm the canonical address before this entry is cited in a release.}
@inproceedings{lee-etal-2026-generative,
  title     = {Generative-Evaluative Agreement: A Necessary Validity Criterion for {LLM}-Enabled Adaptive Assessment},
  author    = {Lee, Grandee and
               Wang, Yue and
               Lye, Che Yee and
               Peh, Luke},
  editor    = {Kochmar, Ekaterina and
               Alhafni, Bashar and
               Bann{\`o}, Stefano and
               Bexte, Marie and
               Burstein, Jill and
               Horbach, Andrea and
               Laarmann-Quante, Ronja and
               Tack, Anais and
               Yaneva, Victoria and
               Yuan, Zheng},
  booktitle = {Proceedings of the 21st Workshop on Innovative Use of {NLP} for Building Educational Applications ({BEA} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.bea-1.54/},
  pages     = {798--812},
  isbn      = {979-8-89176-409-5},
  abstract  = {When the same LLM generates assessment items, simulates student responses, and scores them, the validation loop is self-referential. We introduce Generative-Evaluative Agreement (GEA), a validity criterion measuring whether an LLM{'}s scoring function recovers the skill levels its generative function was instructed to produce. In the first direct measurement of GEA on a two-stage adaptive assessment, the model recovers roughly half the intended variance (r = 0.698) with systematic positive bias. GEA is strong (r {\ensuremath{>}} 0.7) for syntactically verifiable skills but near zero for design-level skills, and low-skill overestimation inflates scores near the routing threshold. We argue that granular, skill-decomposed rubrics are the principal proposed mechanism for strengthening GEA and outline complementary mitigations.},
}
Markdown (Informal)
[Generative-Evaluative Agreement: A Necessary Validity Criterion for LLM-Enabled Adaptive Assessment](https://preview.aclanthology.org/ingest-acl-workshops/2026.bea-1.54/) (Lee et al., BEA 2026)
ACL