@inproceedings{liu-etal-2024-branchnorm,
title = "{B}ranch{N}orm: Robustly Scaling Extremely Deep Transformers",
author = "Liu, Yijin and
Zeng, Xianfeng and
Meng, Fandong and
Zhou, Jie",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2024.findings-acl.695/",
doi = "10.18653/v1/2024.findings-acl.695",
pages = "11675--11687",
abstract = "Recently, DeepNorm scales Transformers into extremely deep (i.e., 1000 layers) and reveals the promising potential of deep scaling. To stabilize the training of deep models, DeepNorm attempts to constrain the model update to a constant value. Although applying such a constraint can benefit the early stage of model training, it may lead to undertrained models during the whole training procedure. In this paper, we propose BranchNorm, which dynamically rescales the non-residual branch of Transformer in accordance with the training period. BranchNorm not only theoretically stabilizes the training with smooth gradient norms at the early stage, but also encourages better convergence in the subsequent training stage. Experimental results on multiple translation tasks demonstrate that BranchNorm achieves a better trade-off between training stability and converge performance."
}
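The abstract describes BranchNorm as dynamically rescaling the non-residual branch of the Transformer over the course of training. Below is a minimal, hypothetical PyTorch sketch of that general idea only; the `warmup_steps` parameter and the linear ramp schedule are illustrative assumptions, not the paper's exact formulation.

```python
import torch
import torch.nn as nn


class BranchScaledResidual(nn.Module):
    """Illustrative sketch (not the authors' reference code): a residual
    block whose non-residual branch is rescaled by a factor that grows
    with the training step, as described in the abstract."""

    def __init__(self, sublayer: nn.Module, warmup_steps: int = 4000):
        super().__init__()
        self.sublayer = sublayer
        self.warmup_steps = warmup_steps
        # Track the current training step as a buffer so it is saved
        # with the module's state dict.
        self.register_buffer("step", torch.zeros((), dtype=torch.long))

    def branch_scale(self) -> float:
        # Hypothetical schedule: ramp the branch weight from 0 to 1
        # over the warmup period, then keep the standard residual form.
        return min(self.step.item() / self.warmup_steps, 1.0)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.training:
            self.step += 1
        # Residual path is left untouched; only the non-residual
        # branch is rescaled.
        return x + self.branch_scale() * self.sublayer(x)
```

For example, `sublayer` could be a feed-forward or attention sub-module of a Transformer layer; at the start of training the branch contributes little (stabilizing updates), and the contribution grows toward a standard residual connection as training proceeds.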