@inproceedings{wang-etal-2025-wonderland,
title = "{W}onderland{\_}{EDU}@{HKU} at {BEA} 2025 Shared Task: Fine-tuning Large Language Models to Evaluate the Pedagogical Ability of {AI}-powered Tutors",
author = "Wang, Deliang and
Yang, Chao and
Chen, Gaowei",
editor = {Kochmar, Ekaterina and
Alhafni, Bashar and
Bexte, Marie and
Burstein, Jill and
Horbach, Andrea and
Laarmann-Quante, Ronja and
Tack, Ana{\"i}s and
Yaneva, Victoria and
Yuan, Zheng},
booktitle = "Proceedings of the 20th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2025)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/landing_page/2025.bea-1.79/",
pages = "1040--1048",
ISBN = "979-8-89176-270-1",
abstract = "The potential of large language models (LLMs) as AI tutors to facilitate student learning has garnered significant interest, with numerous studies exploring their efficacy in educational contexts. Notably, Wang and Chen (2025) suggests that the relationship between AI model performance and educational outcomes may not always be positively correlated; less accurate AI models can sometimes achieve similar educational impacts to their more accurate counterparts if designed into learning activities appropriately. This underscores the need to evaluate the pedagogical capabilities of LLMs across various dimensions, empowering educators to select appropriate dimensions and LLMs for specific analyses and instructional activities. Addressing this imperative, the BEA 2025 workshop initiated a shared task aimed at comprehensively assessing the pedagogical potential of AI-powered tutors. In this task, our team employed parameter-efficient fine-tuning (PEFT) on Llama-3.2-3B to automatically assess the quality of feedback generated by LLMs in student-teacher dialogues, concentrating on mistake identification, mistake location, guidance provision, and guidance actionability. The results revealed that the fine-tuned Llama-3.2-3B demonstrated notable performance, especially in mistake identification, mistake location, and guidance actionability, securing a top-ten ranking across all tracks. These outcomes highlight the robustness and significant promise of the PEFT method in enhancing educational dialogue analysis."
}
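
The abstract describes parameter-efficient fine-tuning of Llama-3.2-3B to classify tutor feedback along dimensions such as mistake identification. Below is a minimal sketch of one plausible setup using LoRA adapters via Hugging Face's `transformers` and `peft` libraries; the checkpoint name, three-way label scheme, adapter targets, and hyperparameters are illustrative assumptions, not the authors' reported configuration.

```python
# Hedged sketch: LoRA-style PEFT fine-tuning of Llama-3.2-3B as a sequence
# classifier for one evaluation dimension (e.g. mistake identification).
# Labels, adapter targets, and hyperparameters are assumptions for
# illustration, not the paper's exact setup.
import torch
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
)

MODEL = "meta-llama/Llama-3.2-3B"          # base model named in the abstract
LABELS = ["Yes", "To some extent", "No"]   # assumed three-way annotation scheme

tokenizer = AutoTokenizer.from_pretrained(MODEL)
tokenizer.pad_token = tokenizer.eos_token  # Llama has no pad token by default
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL, num_labels=len(LABELS), torch_dtype=torch.bfloat16
)
model.config.pad_token_id = tokenizer.pad_token_id

# Wrap the base model with low-rank adapters; only the adapter weights
# (and the classification head) are trained.
peft_config = LoraConfig(
    task_type="SEQ_CLS", r=16, lora_alpha=32, lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
)
model = get_peft_model(model, peft_config)

# Toy example: dialogue context plus tutor feedback, labeled for one
# dimension (0 = "Yes", the tutor identified the mistake).
train = Dataset.from_list([
    {"text": "Student: 3/4 + 1/4 = 4/8 ...\nTutor: Check how you added "
             "the numerators and what happens to the denominator.",
     "label": 0},
])
train = train.map(
    lambda ex: tokenizer(ex["text"], truncation=True, max_length=1024),
    remove_columns=["text"],
)

trainer = Trainer(
    model=model,
    args=TrainingArguments("peft-tutor-eval", num_train_epochs=3,
                           per_device_train_batch_size=4, learning_rate=2e-4),
    train_dataset=train,
    data_collator=DataCollatorWithPadding(tokenizer),  # dynamic padding
)
trainer.train()
```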