@inproceedings{zhang-etal-2025-accelerating,
title = "Accelerating Dense {LLM}s via L0-regularized Mixture-of-Experts",
author = "Zhang, Zhenyu and
Yang, Jiudong and
Tao, Zhaowen and
Chen, Meng",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/dashboard/2025.acl-short.39/",
doi = "10.18653/v1/2025.acl-short.39",
pages = "504--513",
ISBN = "979-8-89176-252-7",
abstract = "Large language models (LLMs) achieve strong performance but suffer from slow and costly inference. Existing acceleration methods often lead to noticeable performance degradation, while Mixture-of-Experts (MoE) models require extensive computational resources. In this paper, we propose L0-MoE, a lightweight MoE approach using L0-regularization to accelerate dense LLMs nearly without performance loss. Our method introduces a cluster confusion matrix for domain-aware dataset curation and applies dynamic batching for efficient training. Experiments show that L0-MoE achieves up to 2.5x speedup over dense models while maintaining competitive performance, outperforming existing LLM acceleration baselines."
}