@inproceedings{csordas-etal-2023-approximating,
title = "Approximating Two-Layer Feedforward Networks for Efficient Transformers",
author = {Csord{\'a}s, R{\'o}bert and
Irie, Kazuki and
Schmidhuber, J{\"u}rgen},
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.findings-emnlp.49/",
doi = "10.18653/v1/2023.findings-emnlp.49",
pages = "674--692",
abstract = "How to reduce compute and memory requirements of neural networks (NNs) without sacrificing performance? Many recent works use sparse Mixtures of Experts (MoEs) to build resource-efficient large language models (LMs). Here we introduce several novel perspectives on MoEs, presenting a general framework that *unifies* various methods to *approximate two-layer NNs* (e.g., feedforward blocks of Transformers), including product-key memories (PKMs). Leveraging insights from this framework, we propose methods to improve both MoEs and PKMs. Unlike prior work that compares MoEs with dense baselines under the *compute-equal* condition, our evaluation condition is *parameter-equal*, which is crucial to properly evaluate LMs. We show that our MoEs are competitive with the *dense* Transformer-XL on both the WikiText-103 and enwiki8 datasets at two different scales, while being much more resource efficient. This demonstrates that MoEs are relevant not only to extremely large LMs but also to any-scale resource-efficient LMs. Our code is public."
}
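
For orientation only, the following is a minimal PyTorch sketch, not the authors' released code, of the comparison the abstract describes: a dense two-layer feedforward block versus a sparse top-k Mixture-of-Experts layer built from slices of the same hidden width, so that the two are roughly *parameter-equal* while the MoE activates only k of n_experts experts per token. Class names, the sigmoid router, and all hyperparameters below are illustrative assumptions.

```python
# Illustrative sketch only (assumed names and hyperparameters, not the paper's code):
# a dense two-layer FFN vs. a parameter-matched sparse top-k MoE approximation of it.
import torch
import torch.nn as nn
import torch.nn.functional as F


class DenseFFN(nn.Module):
    """Standard Transformer feedforward block: d_model -> d_ff -> d_model."""

    def __init__(self, d_model: int, d_ff: int):
        super().__init__()
        self.w1 = nn.Linear(d_model, d_ff)
        self.w2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.w2(F.relu(self.w1(x)))


class TopKMoEFFN(nn.Module):
    """Sparse MoE: d_ff is split across n_experts; only k experts run per token.

    With expert_ff = d_ff // n_experts, total parameters roughly match the dense
    block (modulo biases and the small router), but per-token compute scales
    with k / n_experts.
    """

    def __init__(self, d_model: int, d_ff: int, n_experts: int, k: int):
        super().__init__()
        expert_ff = d_ff // n_experts
        self.k = k
        self.router = nn.Linear(d_model, n_experts, bias=False)
        self.w1 = nn.Parameter(torch.randn(n_experts, d_model, expert_ff) * 0.02)
        self.w2 = nn.Parameter(torch.randn(n_experts, expert_ff, d_model) * 0.02)

    def forward(self, x):                        # x: (tokens, d_model)
        scores = torch.sigmoid(self.router(x))   # per-expert gate scores
        topv, topi = scores.topk(self.k, dim=-1)  # pick k experts per token
        out = torch.zeros_like(x)
        for slot in range(self.k):
            idx, gate = topi[:, slot], topv[:, slot:slot + 1]
            h = F.relu(torch.einsum("td,tdf->tf", x, self.w1[idx]))
            out = out + gate * torch.einsum("tf,tfd->td", h, self.w2[idx])
        return out


if __name__ == "__main__":
    d_model, d_ff = 512, 2048
    dense = DenseFFN(d_model, d_ff)
    moe = TopKMoEFFN(d_model, d_ff, n_experts=8, k=2)
    x = torch.randn(16, d_model)
    print(dense(x).shape, moe(x).shape)  # both (16, 512)
    print(sum(p.numel() for p in dense.parameters()),
          sum(p.numel() for p in moe.parameters()))  # roughly parameter-equal
```

Running the sketch prints matching output shapes and near-equal parameter counts, which is the point of the paper's parameter-equal evaluation setting: the MoE is compared against a dense baseline of the same size, not one with the same compute.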