% Conference-paper record for "Jakiro: Boosting Speculative Decoding via Decoupled MoE"
% (Huang, Yang, Liu, Ren; ACL 2026, Volume 1: Long Papers, pp. 10649--10668).
% NOTE(review): the url field targets a preview/ingest host -- presumably it should be
%   replaced by the canonical Anthology URL once the volume is published; confirm.
% NOTE(review): the address field looks like the conference location rather than the
%   publisher's city; kept as exported -- confirm against the bibliography style in use.
@inproceedings{huang-etal-2026-jakiro,
  title     = {Jakiro: Boosting Speculative Decoding via Decoupled {MoE}},
  author    = {Huang, Haiduo and
               Yang, Fuwei and
               Liu, Zhenhua and
               Ren, Pengju},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association for Computational Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.487/},
  pages     = {10649--10668},
  isbn      = {979-8-89176-390-6},
  abstract  = {Speculative decoding has emerged as a promising technique to accelerate large language model inference by employing a smaller draft model to predict multiple tokens, which are then verified in parallel by the larger target model. However, existing approaches face a fundamental limitation: candidates at the same tree layer share identical feature representations, constraining diversity and diminishing overall effectiveness. We identify this as an intra-layer coupling problem that limits prediction accuracy. To address this challenge, we propose Jakiro, which introduces decoupled Mixture of Experts (MoE) into the draft model, enabling different experts to generate diverse candidate tokens from distinct feature spaces. We further propose Contrastive-Enhanced Parallel Decoding (CEPD) that combines autoregressive and parallel decoding with a contrastive mechanism to reduce inference steps while maintaining accuracy. Extensive experiments across diverse models and tasks demonstrate that Jakiro achieves significant speedups over strong baselines, with particularly notable improvements in non-greedy decoding scenarios where token diversity is crucial.},
}
Markdown (Informal)
[Jakiro: Boosting Speculative Decoding via Decoupled MoE](https://preview.aclanthology.org/ingest-acl/2026.acl-long.487/) (Huang et al., ACL 2026)
ACL
- Haiduo Huang, Fuwei Yang, Zhenhua Liu, and Pengju Ren. 2026. Jakiro: Boosting Speculative Decoding via Decoupled MoE. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 10649–10668, San Diego, California, United States. Association for Computational Linguistics.