% BEA 2025 shared-task system paper by team bea-jh; url is the canonical ACL Anthology landing page for paper ID 2025.bea-1.80.
@inproceedings{roh-bang-2025-bea,
  title     = {bea-jh at {BEA} 2025 Shared Task: Evaluating {AI}-powered Tutors through Pedagogically-Informed Reasoning},
  author    = {Roh, Jihyeon and
               Bang, Jinhyun},
  editor    = {Kochmar, Ekaterina and
               Alhafni, Bashar and
               Bexte, Marie and
               Burstein, Jill and
               Horbach, Andrea and
               Laarmann-Quante, Ronja and
               Tack, Ana{\"i}s and
               Yaneva, Victoria and
               Yuan, Zheng},
  booktitle = {Proceedings of the 20th Workshop on Innovative Use of {NLP} for Building Educational Applications ({BEA} 2025)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.bea-1.80/},
  pages     = {1049--1059},
  isbn      = {979-8-89176-270-1},
  abstract  = {The growing use of large language models (LLMs) for AI-powered tutors in education highlights the need for reliable evaluation of their pedagogical abilities. In this work, we propose a reasoning-based evaluation methodology that leverages pedagogical domain knowledge to assess LLM-generated feedback in mathematical dialogues while providing insights into why a particular evaluation is given. We design structured prompts to invoke pedagogically-informed reasoning from LLMs and compare base model candidates selected for their strengths in reasoning, mathematics, and overall instruction-following. We employ Group Relative Policy Optimization (GRPO), a reinforcement learning method known to improve reasoning performance, to train models to perform evaluation in four pedagogically motivated dimensions, Mistake Identification, Mistake Location, Providing Guidance, and Actionability. Experimental results show that our GRPO-based models consistently outperform the base model and GPT-4.1, and surpass models trained using supervised fine-tuning in three out of four dimensions. Notably, our method achieved top-ranked performance in Actionability and competitive performance in two other dimensions in the BEA 2025 Shared Task under the team name bea-jh, underscoring the value of generating pedagogically grounded rationales for improving the quality of educational feedback evaluation.},
}
Markdown (Informal)
[bea-jh at BEA 2025 Shared Task: Evaluating AI-powered Tutors through Pedagogically-Informed Reasoning](https://aclanthology.org/2025.bea-1.80/) (Roh & Bang, BEA 2025)
ACL