% ACL 2026 long paper, Anthology ID 2026.acl-long.518 (taken from the url field).
% NOTE(review): url points at the preview.aclanthology.org "ingest-acl" host;
% confirm and switch to the canonical Anthology URL once the volume is published.
@inproceedings{shi-etal-2026-beyond,
  title     = {Beyond Pedagogical Principles: Multi-Horizon Preference Optimization for Efficient {S}ocratic Tutoring},
  author    = {Shi, Xin and
               Zhang, Chao and
               Zhu, Yifan and
               Zhang, Xueqiao and
               Luo, Yawei},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.518/},
  pages     = {11289--11306},
  isbn      = {979-8-89176-390-6},
  abstract  = {The development of LLM-based tutor agents faces challenges in simultaneously ensuring adherence to pedagogical principles and achieving optimal pedagogical effectiveness, particularly in dynamic, multi-turn interactions. Existing methods are often constrained by static data or sparse reward signals in online settings. To address this gap, we propose Multi-Horizon Preference Optimization (MHPO), a novel framework that iteratively refines tutor agents using a multi-horizon reward function within a dynamic teacher-student simulation environment. Specifically, this reward function is designed to capture both turn-level pedagogical quality and trajectory-level pedagogical effectiveness, which is estimated via Monte Carlo rollouts. We further investigate two distinct strategies to aggregate these rewards for policy optimization. Our experiments demonstrate that MHPO significantly enhances base model performance, achieving a superior balance between principles and effectiveness compared to various baselines.},
}
Markdown (Informal)
[Beyond Pedagogical Principles: Multi-Horizon Preference Optimization for Efficient Socratic Tutoring](https://preview.aclanthology.org/ingest-acl/2026.acl-long.518/) (Shi et al., ACL 2026)
ACL