@inproceedings{liu-etal-2025-masks,
  author    = {Liu, Peiyu and
               Wei, Tianwen and
               Zhu, Bo and
               Zhao, Xin and
               Yan, Shuicheng},
  title     = {Masks Can be Learned as an Alternative to Experts},
  editor    = {Che, Wanxiang and
               Nabende, Joyce and
               Shutova, Ekaterina and
               Pilehvar, Mohammad Taher},
  booktitle = {Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  publisher = {Association for Computational Linguistics},
  address   = {Vienna, Austria},
  month     = jul,
  year      = {2025},
  pages     = {15800--15811},
  isbn      = {979-8-89176-251-0},
  doi       = {10.18653/v1/2025.acl-long.768},
  url       = {https://aclanthology.org/2025.acl-long.768/},
  abstract  = {In this work, we investigate how to sparsify a pre-trained dense large language model into a mixture-of-experts (MoE) architecture for faster inference. Our approach applies mask matrix to the activations for each expert, constrained by $L_0$ regularization to minimize the number of activated parameters. Starting with all parameters active, the model is progressively sparsified during training, ensuring minimal performance loss. This approach proves more efficient than one-shot sparsification techniques, which typically require significant resources for performance recovery. Moreover, our approach automatically identifies shared, token-specific, and inactive experts, allowing for more efficient allocation of computational resources. Through extensive experiments, we achieve up to 97{\%} performance retention on downstream tasks with only 50{\%} of the feed-forward parameters activated in dense models. Beyond enhancing inference efficiency, this strategy of sharing computational units among experts presents a valuable framework for designing more generalized and efficient MoE architectures, opening avenues for future advancements in expert-based models.},
}
Markdown (Informal)
[Masks Can be Learned as an Alternative to Experts](https://aclanthology.org/2025.acl-long.768/) (Liu et al., ACL 2025)
ACL
- Peiyu Liu, Tianwen Wei, Bo Zhu, Xin Zhao, and Shuicheng Yan. 2025. Masks Can be Learned as an Alternative to Experts. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 15800–15811, Vienna, Austria. Association for Computational Linguistics.