@inproceedings{huang-wilson-2025-evaluating,
title = "Evaluating {LLM}-Based Automated Essay Scoring: Accuracy, Fairness, and Validity",
author = "Huang, Yue and
Wilson, Joshua",
editor = "Wilson, Joshua and
Ormerod, Christopher and
Beiting Parrish, Magdalen",
booktitle = "Proceedings of the Artificial Intelligence in Measurement and Education Conference (AIME-Con): Works in Progress",
month = oct,
year = "2025",
address = "Wyndham Grand Pittsburgh, Downtown, Pittsburgh, Pennsylvania, United States",
publisher = "National Council on Measurement in Education (NCME)",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.aimecon-wip.9/",
pages = "71--83",
ISBN = "979-8-218-84229-1",
    abstract = "This study evaluates large language models (LLMs) for automated essay scoring (AES), comparing prompt strategies and fairness across student groups. We found that well-designed prompting helps LLMs approach traditional AES performance, but both differ from human scores for ELLs{---}the traditional model shows larger overall gaps, while LLMs show subtler disparities."
}