@inproceedings{luo-etal-2025-diffskip,
title = "{D}iff{S}kip: Differential Layer Skipping in Large Language Models",
author = "Luo, Xuan and
Wang, Weizhi and
Yan, Xifeng",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/mtsummit-25-ingestion/2025.findings-acl.377/",
doi = "10.18653/v1/2025.findings-acl.377",
pages = "7221--7231",
ISBN = "979-8-89176-256-5",
abstract = "Existing Large Language Models (LLMs) enforce uniform computation across all tokens. We analyze the correlation between the input-output difference of self-attention block and Feed-Forward Network (FFN) within the same transformer layer, and find that these two differential vectors are highly correlated. Thus, we propose to dynamically skip the FFN blocks based on the self-attention difference and introduce Diffential Layer Skipping (DiffSkip) to show that LLMs are inherently dynamic-depth models, capable of adjusting computational depth when generating different tokens. DiffSkip employs a lightweight router module to dynamically skip a set of FFN blocks in LLMs and only requires efficient fine-tuning while keeping the whole LLM frozen. Experimental results demonstrate that DiffSkip effectively enables dynamic FFN skipping in decoder-only language models, even in continuous token generation tasks where many layer-skipping methods struggle."
}
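
As a rough illustration of the mechanism summarized in the abstract, below is a minimal PyTorch-style sketch of router-gated FFN skipping. The module names (`DiffSkipRouter`, `GatedDecoderLayer`), the hard threshold gate, and the `skip_threshold` parameter are illustrative assumptions, not the paper's exact implementation.

```python
# Minimal sketch of differential FFN skipping, assuming a PyTorch-style
# decoder layer. The router design and the hard threshold gate below are
# illustrative assumptions, not the paper's exact method.
import torch
import torch.nn as nn


class DiffSkipRouter(nn.Module):
    """Lightweight router scoring the self-attention input-output difference."""

    def __init__(self, hidden_size: int):
        super().__init__()
        self.proj = nn.Linear(hidden_size, 1)

    def forward(self, attn_diff: torch.Tensor) -> torch.Tensor:
        # Per-token probability of executing (not skipping) the FFN block.
        return torch.sigmoid(self.proj(attn_diff)).squeeze(-1)


class GatedDecoderLayer(nn.Module):
    """Wraps frozen attention/FFN blocks; only the router is trainable."""

    def __init__(self, attn: nn.Module, ffn: nn.Module, hidden_size: int,
                 skip_threshold: float = 0.5):
        super().__init__()
        self.attn = attn                  # pretrained self-attention block (frozen)
        self.ffn = ffn                    # pretrained FFN block (frozen)
        self.router = DiffSkipRouter(hidden_size)
        self.skip_threshold = skip_threshold

    def forward(self, hidden: torch.Tensor) -> torch.Tensor:
        attn_out = self.attn(hidden)                 # (batch, seq, hidden)
        attn_diff = attn_out - hidden                # input-output difference of attention
        gate = self.router(attn_diff)                # per-token execution score (batch, seq)
        execute = gate > self.skip_threshold         # tokens whose FFN is actually run
        hidden = attn_out
        ffn_out = torch.zeros_like(hidden)
        if execute.any():
            # Apply the FFN only to the selected tokens; skipped tokens pass through.
            ffn_out[execute] = self.ffn(hidden[execute])
        return hidden + ffn_out
```

Here `attn` and `ffn` stand in for pretrained blocks kept frozen; only the router would be fine-tuned, matching the abstract's description of efficient fine-tuning with the whole LLM frozen.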