@inproceedings{zhang-liu-2025-staged,
title = "Staged Knowledge Distillation Through Least-to-Most Prompting: Optimizing Teacher Guidance via Difficulty-Aware Training",
author = "Zhang, Mengxiang and
Liu, Lingyuan",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.findings-emnlp.451/",
doi = "10.18653/v1/2025.findings-emnlp.451",
pages = "8489--8501",
ISBN = "979-8-89176-335-7",
abstract = "Knowledge distillation (KD) enables the compression of large language models (LLMs) by transferring knowledge from a high-capacity teacher model to a resource-efficient student model, maintaining competitive performance for tasks such as instruction following. However, conventional white-box KD methods often suffer from training-inference mismatches and suboptimal performance due to the asymmetric nature of Kullback-Leibler divergence (KLD) and reliance on computationally expensive student-generated outputs. To address these challenges, we propose Least-to-Most Prompting Knowledge Distillation (L2M-KD), a novel white-box KD method grounded in curriculum learning (CL) and adaptive loss design. L2M-KD employs a two-pronged approach: (1) a CL strategy that ranks training samples by difficulty using Rouge-L scores, partitioning them into easy-to-hard subsets across multiple stages, and (2) an adaptive KD loss that transitions from KLD to skew KLD, dynamically adjusting teacher guidance to mitigate mode-averaging and over-smoothing. Extensive experiments on instruction-following tasks demonstrate that L2M-KD outperforms existing white-box KD methods, achieving superior student model performance with reduced computational overhead by leveraging ground-truth outputs exclusively. Our findings underscore the efficacy of difficulty-aware training and adaptive teacher guidance, offering a computationally efficient and robust approach to LLM compression."
}
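The abstract describes two mechanisms: ranking training samples by Rouge-L to build easy-to-hard curriculum stages, and a KD loss that shifts from forward KLD toward skew KLD. The sketch below is a minimal illustration of both ideas, assuming PyTorch and the `rouge_score` package; the helper names, the three-stage split, the `alpha` schedule, and the skew-KLD form KL(p || alpha*p + (1-alpha)*q) are assumptions for illustration, not details taken from the paper.

```python
# Illustrative sketch only; not the L2M-KD reference implementation.
import torch
import torch.nn.functional as F
from rouge_score import rouge_scorer


def partition_by_difficulty(samples, num_stages=3):
    """Rank (prediction, reference) pairs by Rouge-L and split easy -> hard."""
    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    scored = [(scorer.score(ref, pred)["rougeL"].fmeasure, (pred, ref))
              for pred, ref in samples]
    # Higher Rouge-L is treated as easier, so stage 0 holds the easiest samples.
    scored.sort(key=lambda x: x[0], reverse=True)
    stage_size = max(1, len(scored) // num_stages)
    stages = []
    for i in range(num_stages):
        start = i * stage_size
        end = (i + 1) * stage_size if i < num_stages - 1 else len(scored)
        stages.append([pair for _, pair in scored[start:end]])
    return stages


def adaptive_kd_loss(student_logits, teacher_logits, alpha):
    """KD loss that equals forward KLD at alpha=0 and a skew KLD for alpha>0.

    Skew KLD is taken here as KL(p || alpha*p + (1-alpha)*q), with p the
    teacher and q the student distribution; the paper's exact formulation
    and alpha schedule may differ.
    """
    p = F.softmax(teacher_logits, dim=-1)
    q = F.softmax(student_logits, dim=-1)
    mix = alpha * p + (1.0 - alpha) * q
    kl = torch.sum(p * (torch.log(p + 1e-9) - torch.log(mix + 1e-9)), dim=-1)
    return kl.mean()
```

A training loop under these assumptions would iterate over the stages in order, easiest first, and increase `alpha` from 0 toward a fixed skew value as the later, harder stages are reached, softening the teacher's guidance over time.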