@inproceedings{li-etal-2024-eagle,
  title     = {{EAGLE}-2: Faster Inference of Language Models with Dynamic Draft Trees},
  author    = {Li, Yuhui and
               Wei, Fangyun and
               Zhang, Chao and
               Zhang, Hongyang},
  editor    = {Al-Onaizan, Yaser and
               Bansal, Mohit and
               Chen, Yun-Nung},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.emnlp-main.422/},
  doi       = {10.18653/v1/2024.emnlp-main.422},
  pages     = {7421--7432},
  abstract  = {Inference with modern Large Language Models (LLMs) is expensive and time-consuming, and speculative sampling has proven to be an effective solution. Most speculative sampling methods such as EAGLE use a static draft tree, implicitly assuming that the acceptance rate of draft tokens depends only on their position. Interestingly, we found that the acceptance rate of draft tokens is also context-dependent. In this paper, building upon EAGLE, we propose EAGLE-2, which introduces a new technique of context-aware dynamic draft tree into drafting modeling. This improvement leverages the fact that the draft model of EAGLE is well-calibrated: the confidence scores from the draft model approximate acceptance rates with small errors. We conducted extensive evaluations on three series of LLMs and six tasks, with EAGLE-2 achieving speedup ratios of up to 5x, which is 1.3x that of EAGLE. EAGLE-2 also ensures that the distribution of the generated text remains unchanged, making it a lossless acceleration algorithm.},
}
@comment{Markdown (Informal) citation, copied from the ACL Anthology page:
[EAGLE-2: Faster Inference of Language Models with Dynamic Draft Trees](https://aclanthology.org/2024.emnlp-main.422/) (Li et al., EMNLP 2024)
ACL}