@inproceedings{zheng-wang-2025-faster,
title = "Faster Speculative Decoding via Effective Draft Decoder with Pruned Candidate Tree",
author = "Zheng, Huanran and
Wang, Xiaoling",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.486/",
pages = "9856--9868",
ISBN = "979-8-89176-251-0",
abstract = "Speculative Decoding (SD) is a promising method for reducing the inference latency of large language models (LLMs). A well-designed draft model and an effective draft candidate tree construction method are key to enhancing the acceleration effect of SD. In this paper, we first propose the Effective Draft Decoder (EDD), which treats the LLM as a powerful encoder and generates more accurate draft tokens by leveraging the encoding results as soft prompts. Furthermore, we use KL divergence instead of the standard cross-entropy loss to better align the draft model{'}s output with the LLM. Next, we introduce the Pruned Candidate Tree (PCT) algorithm to construct a more efficient candidate tree. Specifically, we found that the confidence scores predicted by the draft model are well-calibrated with the acceptance probability of draft tokens. Therefore, PCT estimates the expected time gain for each node in the candidate tree based on confidence scores and retains only the nodes that contribute to acceleration, pruning away redundant nodes. We conducted extensive experiments with various LLMs across four datasets. The experimental results verify the effectiveness of our proposed method, which significantly improves the performance of SD and reduces the inference latency of LLMs."
}
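The abstract's Pruned Candidate Tree (PCT) idea — treating the draft model's confidence as a calibrated acceptance probability and keeping only tree nodes whose expected time gain is positive — can be illustrated with a minimal sketch. This is not the paper's implementation; the `Node` structure and the `gain_per_token` / `cost_per_node` parameters are hypothetical placeholders standing in for whatever gain model the authors use.

```python
from dataclasses import dataclass, field
from typing import List

@dataclass
class Node:
    token_id: int
    confidence: float               # draft model's confidence, read as acceptance prob.
    children: List["Node"] = field(default_factory=list)

def prune_candidate_tree(root: Node,
                         gain_per_token: float = 1.0,
                         cost_per_node: float = 0.3) -> Node:
    """Keep only nodes whose estimated expected time gain is positive (illustrative).

    path_prob is the product of confidences from the root to a node, i.e. the
    (assumed well-calibrated) probability that the verifier accepts the whole
    draft prefix ending at that node.
    """
    def visit(node: Node, path_prob: float) -> None:
        kept = []
        for child in node.children:
            child_prob = path_prob * child.confidence
            # Expected benefit of including this draft token vs. its verification cost.
            expected_gain = child_prob * gain_per_token - cost_per_node
            if expected_gain > 0:
                visit(child, child_prob)
                kept.append(child)
        node.children = kept

    visit(root, 1.0)
    return root
```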