% Conference paper record for PDTrim (ACL 2026, Volume 1: Long Papers).
% NOTE(review): the second author was exported as "Mengsi, Lyu"; it is stored
% family-name-first here ("Lyu, Mengsi") like the other authors -- confirm
% against the paper's title page.
% NOTE(review): url uses the canonical Anthology address derived from the paper
% ID instead of the preview/ingest build it was exported from -- confirm it
% resolves once the volume is published.
@inproceedings{zhang-etal-2026-pdtrim,
  title     = {{PDTrim}: Targeted Pruning for Prefill-Decode Disaggregation in Inference},
  author    = {Zhang, Hao and
               Lyu, Mengsi and
               Chen, Zhuo and
               Ao, Yulong and
               Lin, Yonghua},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.acl-long.1842/},
  pages     = {39673--39690},
  isbn      = {979-8-89176-390-6},
  abstract  = {Large Language Models (LLMs) demonstrate exceptional capabilities across various tasks, but their deployment is constrained by high computational and memory costs. Model pruning provides an effective means to alleviate these demands. However, existing methods often ignore the characteristics of prefill-decode (PD) disaggregation in practice. In this paper, we propose a pruning method that is highly integrated with PD disaggregation, enabling more precise pruning of blocks. Our approach constructs pruning and distillation sets to perform iterative block removal, obtaining better pruning solutions. Moreover, we analyze the pruning sensitivity of the prefill and decode stages and identify removable blocks specific to each stage, making it well suited for PD disaggregation deployment. Extensive experiments demonstrate our approach consistently achieves strong performance in both PD disaggregation and PD unified (non-PD disaggregation) settings, and can also be extended to other non-block pruning methods. Under the same settings, our method achieves improved performance and faster inference.},
}
Markdown (Informal)
[PDTrim: Targeted Pruning for Prefill-Decode Disaggregation in Inference](https://aclanthology.org/2026.acl-long.1842/) (Zhang et al., ACL 2026)
ACL
- Hao Zhang, Mengsi Lyu, Zhuo Chen, Yulong Ao, and Yonghua Lin. 2026. PDTrim: Targeted Pruning for Prefill-Decode Disaggregation in Inference. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 39673–39690, San Diego, California, United States. Association for Computational Linguistics.