@inproceedings{yoshida-2025-reasoning,
title = "Are the Reasoning Models Good at Automated Essay Scoring?",
author = "Yoshida, Lui",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.findings-emnlp.445/",
doi = "10.18653/v1/2025.findings-emnlp.445",
pages = "8388--8394",
ISBN = "979-8-89176-335-7",
abstract = "This study investigates the validity and reliability of reasoning models, specifically OpenAI{'}s o3-mini and o4-mini, in automated essay scoring (AES) tasks. We evaluated these models' performance on the TOEFL11 dataset by measuring agreement with expert ratings (validity) and consistency in repeated evaluations (reliability). Our findings reveal two key results: (1) the validity of reasoning models o3-mini and o4-mini is significantly lower than that of a non-reasoning model GPT-4o mini, and (2) the reliability of reasoning models cannot be considered high, with Intraclass Correlation Coefficients (ICC) of approximately 0.7 compared to GPT-4o mini{'}s 0.95. These results demonstrate that reasoning models, despite their excellent performance on many benchmarks, do not necessarily perform well on specific tasks such as AES. Additionally, we found that few-shot prompting significantly improves performance for reasoning models, while Chain of Thought (CoT) has less impact."
}