@comment{
  Conference-paper record in ACL Anthology export style.
  NOTE(review): the url field points at a preview/ingest host, presumably a
  staging address; confirm the canonical URL before this entry is published.
  NOTE(review): the address field appears to hold the event location as
  exported, not necessarily the publisher's city; confirm against the target
  bibliography style.
}
@inproceedings{zhang-liu-2026-calibrated,
  title     = {Calibrated Progressive Distillation: Co-Designing Curriculum and Target Mixing for Knowledge Distillation of Large Language Models},
  author    = {Zhang, Mengxiang and
               Liu, Lingyuan},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.335/},
  pages     = {6757--6776},
  isbn      = {979-8-89176-395-1},
  abstract  = {Knowledge distillation (KD) is a key technique for compressing large language models (LLMs), yet it faces challenges stemming from the teacher{--}student capacity gap. While existing KD methods address these challenges either by mixing teacher and student distributions in the distillation target or by using curriculum learning to sequence training from easy to hard examples, they typically design these two strategies independently, missing the opportunity for synergistic co-design. To bridge this gap, we propose Calibrated Progressive Distillation (CPD), a white-box KD framework that co-designs curriculum scheduling and target mixing through a unified difficulty-aware principle. CPD uses a difficulty profile to select epoch-specific subsets that ensure a uniform increase in average difficulty, adapting to the dataset{'}s intrinsic hardness structure. Simultaneously, the mixing coefficient in the distillation target and the distillation temperature are synchronized with this progression, gradually shifting supervision from teacher-dominated to student-informed signals as training advances. Theoretically, CPD ensures bounded gradients and induces an implicit attention shift from easy to hard samples. Empirically, CPD consistently outperforms advanced KD methods across diverse tasks, while reducing training runtime by over 10{\%}. Our work demonstrates that aligning data scheduling with distillation signal design is crucial for effective and efficient LLM distillation.},
}

Markdown (Informal)
[Calibrated Progressive Distillation: Co-Designing Curriculum and Target Mixing for Knowledge Distillation of Large Language Models](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.335/) (Zhang & Liu, Findings 2026)
ACL