% Syu and Lee, "Hierarchical Speculative Decoding with Dynamic Window",
% Findings of the ACL: NAACL 2025 (Anthology ID 2025.findings-naacl.462).
% The url field uses the canonical aclanthology.org address rather than a
% preview-branch build, which is not a stable location.
@inproceedings{syu-lee-2025-hierarchical,
  title     = {Hierarchical Speculative Decoding with Dynamic Window},
  author    = {Syu, Shensian and
               Lee, Hung-yi},
  editor    = {Chiruzzo, Luis and
               Ritter, Alan and
               Wang, Lu},
  booktitle = {Findings of the Association for Computational Linguistics: NAACL 2025},
  month     = apr,
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-naacl.462/},
  pages     = {8260--8273},
  isbn      = {979-8-89176-195-7},
  abstract  = {Speculative Decoding (SD) utilizes an efficient draft model to generate multiple tokens, which are subsequently verified in parallel by a target model. This approach has shown significant potential for accelerating inference in large language models (LLMs), with performance heavily reliant on the hyperparameter $K${---}the window size. However, previous methods often depend on simple heuristics to select $K$ or dynamically adjust the window size, which may necessitate additional training or careful resource management to avoid competition. To address these challenges, we propose \textbf{H}ierarchical \textbf{S}peculative \textbf{D}ecoding with \textbf{D}ynamic \textbf{W}indow (HSDDW), a straightforward framework that eliminates the need for additional training. Specifically, we introduce a \textit{self-verify} mechanism that enables the draft model to autonomously decide when to stop generating tokens. Additionally, by integrating a hierarchical structure that leverages the capabilities of models of different sizes, we significantly enhance the overall speed of the system. HSDDW demonstrates competitive performance across four datasets, achieving notable speedups of $2.91\times$ on MT-Bench and $2.99\times$ on Alpaca, outperforming existing state-of-the-art methods.},
}
Markdown (Informal)
[Hierarchical Speculative Decoding with Dynamic Window](https://aclanthology.org/2025.findings-naacl.462/) (Syu & Lee, Findings 2025)
ACL