@inproceedings{lee-etal-2024-breaking,
title = "Breaking {R}e{LU} Barrier: Generalized {M}o{E}fication for Dense Pretrained Models",
author = "Lee, Jaeseong and
Hwang, Seung-won and
Park, Wonpyo and
Ji, Mingi",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2024.emnlp-main.563/",
doi = "10.18653/v1/2024.emnlp-main.563",
pages = "10097--10107",
abstract = "As the scale of language models (LMs) continues to grow, there is a heightened interest in reducing the inference cost associated with these models. Mixture-of-Experts (MoEs) present an efficient alternative to dense models, while the existing methods to convert pretrained dense models to MoEs is limited to ReLU-based models with natural sparsity. This paper introduces G-MoEfication, applicable to arbitrary dense models, where ReLU-based activation sparsity assumptions no longer hold. For generalizations, we encounter the dilemma of needing to zero-out deactivated experts, while also avoiding excessive zeroing-out to retain dense activation information. We publicly release our code and report results conducted with mBERT, SantaCoder-1.1B, Phi-2-2.7B, and Falcon-7B demonstrating the efficacy of our approach in general scenarios: from multitask to multilingual, from fine-tuning to zero-shot evaluation."
}
Markdown (Informal)
[Breaking ReLU Barrier: Generalized MoEfication for Dense Pretrained Models](https://aclanthology.org/2024.emnlp-main.563/) (Lee et al., EMNLP 2024)