@inproceedings{kim-etal-2025-stair,
title = "{STAIR}-{AIG}: Optimizing the Automated Item Generation Process through Human-{AI} Collaboration for Critical Thinking Assessment",
author = "Kim, Euigyum and
Li, Seewoo and
Khalil, Salah and
Shin, Hyo Jeong",
editor = {Kochmar, Ekaterina and
Alhafni, Bashar and
Bexte, Marie and
Burstein, Jill and
Horbach, Andrea and
Laarmann-Quante, Ronja and
Tack, Ana{\"i}s and
Yaneva, Victoria and
Yuan, Zheng},
booktitle = "Proceedings of the 20th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2025)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/acl25-workshop-ingestion/2025.bea-1.69/",
pages = "920--930",
ISBN = "979-8-89176-270-1",
abstract = "The advent of artificial intelligence (AI) has marked a transformative era in educational measurement and evaluation, particularly in the development of assessment items. Large language models (LLMs) have emerged as promising tools for scalable automatic item generation (AIG), yet concerns remain about the validity of AI-generated items in various domains. To address this issue, we propose STAIR-AIG (Systematic Tool for Assessment Item Review in Automatic Item Generation), a human-in-the-loop framework that integrates expert judgment to optimize the quality of AIG items. To explore the functionality of the tool, AIG items were generated in the domain of critical thinking. Subsequently, the human expert and four OpenAI LLMs conducted a review of the AIG items. The results show that while the LLMs demonstrated high consistency in their rating of the AIG items, they exhibited a tendency towards leniency. In contrast, the human expert provided more variable and strict evaluations, identifying issues such as the irrelevance of the construct and cultural insensitivity. These findings highlight the viability of STAIR-AIG as a structured human-AI collaboration approach that facilitates rigorous item review, thus optimizing the quality of AIG items. Furthermore, STAIR-AIG enables iterative review processes and accumulates human feedback, facilitating the refinement of models and prompts. This, in turn, would establish a more reliable and comprehensive pipeline to improve AIG practices."
}