% Conference paper in the Proceedings of the 19th EACL conference
% (Volume 1: Long Papers), pages 5483--5497.
% The url field uses the canonical ACL Anthology landing page; the exported
% record pointed at a temporary preview/ingest staging build of the same page.
% NOTE(review): address holds the location as exported (Rabat, Morocco);
% presumably the conference venue rather than the publisher's city -- confirm
% against the target bibliography style before relying on it.
@inproceedings{mainardi-etal-2026-empirical,
  title     = {An Empirical Study of Speculative Decoding for Small Language Models},
  author    = {Mainardi, Luca and
               Sandikci, Selcuk and
               Vanschoren, Joaquin},
  editor    = {Demberg, Vera and
               Inui, Kentaro and
               Marquez, Llu{\'i}s},
  booktitle = {Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.eacl-long.255/},
  pages     = {5483--5497},
  isbn      = {979-8-89176-380-7},
  abstract  = {Speculative decoding has emerged as a promising approach to accelerate Large Language Model inference. However, existing research has predominantly focused on 7B-70B parameters models, leaving a critical knowledge gap for small language models (1-2B parameters) that are increasingly important for edge computing and agentic AI systems. This paper presents the first comprehensive empirical study of speculative decoding techniques for small language models. We evaluate five distinct method categories across three representative model families and reveal that drafting overhead, rather than draft quality, becomes the primary bottleneck fundamentally limiting acceleration of small models. We demonstrate that traditional independent drafting fails completely due to the suboptimal architecture of available drafters, while self-drafting methods achieve meaningful acceleration only when employing sufficiently efficient draft modules. In contrast, retrieval-based methods with negligible computational overhead yield consistent gains. Based on these insights, we establish practical guidelines for effective small model acceleration.},
}

Markdown (Informal)
[An Empirical Study of Speculative Decoding for Small Language Models](https://preview.aclanthology.org/ingest-eacl/2026.eacl-long.255/) (Mainardi et al., EACL 2026)
ACL
- Luca Mainardi, Selcuk Sandikci, and Joaquin Vanschoren. 2026. An Empirical Study of Speculative Decoding for Small Language Models. In Proceedings of the 19th Conference of the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers), pages 5483–5497, Rabat, Morocco. Association for Computational Linguistics.