% ACL Anthology record D19-1373 (EMNLP-IJCNLP 2019 proceedings paper).
% `url` uses the canonical Anthology address rather than a preview/staging branch.
@inproceedings{aji-etal-2019-combining,
    title = "Combining Global Sparse Gradients with Local Gradients in Distributed Neural Network Training",
    author = "Aji, Alham Fikri and
      Heafield, Kenneth and
      Bogoychev, Nikolay",
    editor = "Inui, Kentaro and
      Jiang, Jing and
      Ng, Vincent and
      Wan, Xiaojun",
    booktitle = "Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)",
    month = nov,
    year = "2019",
    address = "Hong Kong, China",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/D19-1373/",
    doi = "10.18653/v1/D19-1373",
    pages = "3626--3631",
    abstract = "One way to reduce network traffic in multi-node data-parallel stochastic gradient descent is to only exchange the largest gradients. However, doing so damages the gradient and degrades the model's performance. Transformer models degrade dramatically while the impact on RNNs is smaller. We restore gradient quality by combining the compressed global gradient with the node's locally computed uncompressed gradient. Neural machine translation experiments show that Transformer convergence is restored while RNNs converge faster. With our method, training on 4 nodes converges up to 1.5x as fast as with uncompressed gradients and scales 3.5x relative to single-node training."
}
Markdown (Informal)
[Combining Global Sparse Gradients with Local Gradients in Distributed Neural Network Training](https://aclanthology.org/D19-1373/) (Aji et al., EMNLP-IJCNLP 2019)
ACL