@inproceedings{jo-etal-2026-task,
  title     = "Task-aware Block Pruning with Output Distribution Signals for Large Language Models",
  author    = "Jo, Song-ha and
               Ko, Youngrok and
               Lee, Sang-goo and
               Seol, Jinseok",
  editor    = "Demberg, Vera and
               Inui, Kentaro and
               Marquez, Llu{\'i}s",
  booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
  month     = mar,
  year      = "2026",
  address   = "Rabat, Morocco",
  publisher = "Association for Computational Linguistics",
  url       = "https://preview.aclanthology.org/ingest-eacl/2026.findings-eacl.320/",
  pages     = "6089--6107",
  isbn      = "979-8-89176-386-9",
  abstract  = "Large language models (LLMs) provide excellent performance, but their practical deployment is limited by the substantial compute and memory demands of large models and the latency of auto-regressive decoding. To mitigate these inefficiencies, block pruning reduces the number of executed transformer blocks, effectively lowering latency while preserving architectural coherence. However, existing methods typically rely on representation similarity or computationally expensive sensitivity analyses to estimate block importance, thereby neglecting task-aware model behavior. To address this limitation, we introduce Task-aware Block Pruning (TaBP), a novel approach that directly captures task-specific inference dynamics by quantifying block-level uncertainty from the statistics of each block{'}s early-exited output distribution on a calibration dataset. Since output distributions reflect the model{'}s confidence and decision uncertainty conditioned on downstream tasks, these statistics provide a principled signal for identifying blocks that are less critical for task performance. Extensive experiments demonstrate that TaBP preserves downstream task performance while substantially reducing inference latency and computational cost, without relying on cost-heavy sensitivity analyses. To facilitate reproducibility and further research, we release our implementation of TaBP on [GitHub](https://github.com/Song-haJo/TaBP).",
}
@comment{Informal markdown citation (from the ACL Anthology export page):
[Task-aware Block Pruning with Output Distribution Signals for Large Language Models](https://preview.aclanthology.org/ingest-eacl/2026.findings-eacl.320/) (Jo et al., Findings 2026)
ACL
}