@inproceedings{kim-etal-2022-says,
    title     = {Who Says Elephants Can't Run: Bringing Large Scale {MoE} Models into Cloud Scale Production},
    author    = {Kim, Young Jin and
                 Henry, Rawn and
                 Fahim, Raffy and
                 Hassan, Hany},
    editor    = {Fan, Angela and
                 Gurevych, Iryna and
                 Hou, Yufang and
                 Kozareva, Zornitsa and
                 Luccioni, Sasha and
                 Sadat Moosavi, Nafise and
                 Ravi, Sujith and
                 Kim, Gyuwan and
                 Schwartz, Roy and
                 R{\"u}ckl{\'e}, Andreas},
    booktitle = {Proceedings of the Third Workshop on Simple and Efficient Natural Language Processing ({SustaiNLP})},
    month     = dec,
    year      = {2022},
    address   = {Abu Dhabi, United Arab Emirates (Hybrid)},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2022.sustainlp-1.6/},
    doi       = {10.18653/v1/2022.sustainlp-1.6},
    pages     = {36--43},
    abstract  = {Mixture of Experts (MoE) models with conditional execution of sparsely activated layers has enabled training models with a much larger number of parameters. As a result, these models have achieved significantly better quality on various natural language processing tasks including machine translation. However, it remains challenging to deploy such models in real-life scenarios due to the large memory requirements and inefficient inference. In this work, we introduce a highly efficient inference framework with several optimization approaches to accelerate the computation of sparse models and cut down the memory consumption significantly. While we achieve up to 26x speed-up in terms of throughput, we also reduce the model size almost to one eighth of the original 32-bit float model by quantizing expert weights into 4-bit integers. As a result, we are able to deploy 136x larger models with 27{\%} less cost and significantly better quality with large scale MoE model deployment compared to the existing solutions. This enables a paradigm shift in deploying large scale multilingual MoE transformers models instead of distilling into dozens of smaller models per language or task.}
}
Markdown (Informal)
[Who Says Elephants Can’t Run: Bringing Large Scale MoE Models into Cloud Scale Production](https://aclanthology.org/2022.sustainlp-1.6/) (Kim et al., SustaiNLP 2022)
ACL