@inproceedings{jang-etal-2026-stable,
  title     = {Stable On-Policy Distillation through Adaptive Target Reformulation},
  author    = {Jang, Ijun and
               Yeom, Jewon and
               Yeo, Juan and
               Lim, Hyunggyu and
               Kim, Taesup},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.2094/},
  pages     = {42217--42227},
  isbn      = {979-8-89176-395-1},
  abstract  = {Knowledge distillation (KD) is a widely adopted technique for transferring capabilities from large language models to smaller student models. However, conventional supervised KD often suffers from a distribution mismatch between training and inference. While on-policy KD approaches attempt to mitigate this issue by learning directly from student-generated outputs, they frequently encounter training instabilities and noisy teacher feedback during early optimization stages. These challenges manifest as pathological gradients in forward KL objectives when students encounter unfamiliar tokens, or as a collapse in distributional diversity within reverse KL regimes. To address these limitations, we propose Veto, an objective-level reformulation that constructs a geometric target distribution in logit space to emphasize agreement between the teacher and the student. By introducing a tunable parameter $\beta$, Veto serves as an Adaptive Gradient Veto that stabilizes optimization by suppressing harmful gradients on low-confidence tokens, while simultaneously acting as a Decisiveness Knob to balance reward-driven performance with output diversity. Extensive experiments across various reasoning and generation tasks demonstrate that Veto consistently outperforms supervised fine-tuning and existing on-policy baselines.},
}
Markdown (Informal)
[Stable On-Policy Distillation through Adaptive Target Reformulation](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.2094/) (Jang et al., Findings 2026)
ACL