@inproceedings{brix-etal-2020-successfully,
title = "Successfully Applying the Stabilized Lottery Ticket Hypothesis to the Transformer Architecture",
author = "Brix, Christopher and
Bahar, Parnia and
Ney, Hermann",
editor = "Jurafsky, Dan and
Chai, Joyce and
Schluter, Natalie and
Tetreault, Joel",
booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics",
month = jul,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2020.acl-main.360/",
doi = "10.18653/v1/2020.acl-main.360",
pages = "3909--3915",
abstract = "Sparse models require less memory for storage and enable a faster inference by reducing the necessary number of FLOPs. This is relevant both for time-critical and on-device computations using neural networks. The stabilized lottery ticket hypothesis states that networks can be pruned after none or few training iterations, using a mask computed based on the unpruned converged model. On the transformer architecture and the WMT 2014 English-to-German and English-to-French tasks, we show that stabilized lottery ticket pruning performs similar to magnitude pruning for sparsity levels of up to 85{\%}, and propose a new combination of pruning techniques that outperforms all other techniques for even higher levels of sparsity. Furthermore, we confirm that the parameter{'}s initial sign and not its specific value is the primary factor for successful training, and show that magnitude pruning cannot be used to find winning lottery tickets."
}