@inproceedings{chen-etal-2025-c2kd,
title = "{C}2{KD}: Cross-layer and Cross-head Knowledge Distillation for Small Language Model-based Recommendation",
author = "Chen, Xiao and
Ma, Changyi and
Fan, Wenqi and
Zhang, Zhaoxiang and
Qing, Li",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.findings-acl.917/",
pages = "17827--17838",
ISBN = "979-8-89176-256-5",
abstract = "Sequential recommenders predict users' next interactions based on historical behavior and are essential in modern recommendation systems. While Large Language Models (LLMs) show promise, their size and high inference costs limit deployment on resource-constrained devices. Small Language Models (SLMs) provide a more efficient alternative for edge devices, but bridging the recommendation performance gap between LLMs and SLMs remains challenging. Typical approaches like supervised fine-tuning or vanilla knowledge distillation (KD) often lead to suboptimal performance or even negative transfer. Our motivational experiments reveal key issues with vanilla KD methods: feature imitation suffers from redundancy and uneven recommendation ability across layers, while prediction mimicking faces conflicts caused by differing weight distributions of prediction heads. To address these challenges, we propose a simple yet effective framework, C2KD, to transfer task-relevant knowledge from two complementary dimensions. Specifically, our method incorporates: (1) cross-layer feature imitation, which uses a dynamic router to select the most relevant teacher layers and assimilate task-relevant knowledge from the teacher{'}s late layers, allowing the student to concentrate on the teacher{'}s specialized knowledge; and (2) cross-head logit distillation, which maps the intermediate features of the student to the teacher{'}s output head, thereby minimizing prediction discrepancies between the teacher and the student. Extensive experiments across diverse model families demonstrate that our approach enables 1B-parameter SLMs to achieve competitive performance compared to LLMs (e.g., Llama3-8B), offering a practical solution for real-world on-device sequential recommendations."
}
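The abstract describes two distillation signals: a dynamic router that picks task-relevant late teacher layers for feature imitation, and a cross-head term that feeds the student's intermediate feature through the teacher's output head to align predictions. The sketch below is a minimal, illustrative rendering of those two ideas only; module names, shapes, the soft routing, the MSE/KL losses, and the temperature are assumptions, not the authors' released implementation.

```python
# Minimal sketch of the two components named in the abstract
# (cross-layer feature imitation, cross-head logit distillation).
# Shapes, losses, and the soft router are illustrative assumptions.
import torch
import torch.nn as nn
import torch.nn.functional as F


class CrossLayerCrossHeadKD(nn.Module):
    def __init__(self, student_dim, teacher_dim, num_teacher_late_layers, temperature=2.0):
        super().__init__()
        # Project student features into the teacher's hidden size.
        self.proj = nn.Linear(student_dim, teacher_dim)
        # Dynamic router: scores the teacher's late layers per example
        # so the student imitates the most task-relevant ones.
        self.router = nn.Linear(student_dim, num_teacher_late_layers)
        self.temperature = temperature

    def forward(self, student_hidden, teacher_late_hiddens, teacher_head, teacher_logits):
        # student_hidden:       (B, d_s)    student intermediate feature
        # teacher_late_hiddens: (B, L, d_t) features of the teacher's late layers
        # teacher_head:         frozen teacher output head, d_t -> item vocabulary
        # teacher_logits:       (B, V)      teacher predictions
        proj_student = self.proj(student_hidden)                        # (B, d_t)

        # (1) Cross-layer feature imitation: softly select teacher layers.
        route_weights = F.softmax(self.router(student_hidden), dim=-1)  # (B, L)
        target_feat = torch.einsum("bl,bld->bd", route_weights, teacher_late_hiddens)
        feat_loss = F.mse_loss(proj_student, target_feat.detach())

        # (2) Cross-head logit distillation: pass the student feature
        # through the teacher's frozen head, then match soft predictions.
        with torch.no_grad():
            soft_targets = F.softmax(teacher_logits / self.temperature, dim=-1)
        cross_head_logits = teacher_head(proj_student)
        logit_loss = F.kl_div(
            F.log_softmax(cross_head_logits / self.temperature, dim=-1),
            soft_targets,
            reduction="batchmean",
        ) * self.temperature ** 2

        return feat_loss + logit_loss
```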