@inproceedings{takase-etal-2023-b2t,
title = "{B}2{T} Connection: Serving Stability and Performance in Deep Transformers",
author = "Takase, Sho and
Kiyono, Shun and
Kobayashi, Sosuke and
Suzuki, Jun",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.findings-acl.192/",
doi = "10.18653/v1/2023.findings-acl.192",
pages = "3078--3095",
    abstract = "From the perspective of layer normalization (LN) position, Transformer architectures can be categorized into two types: Post-LN and Pre-LN. Recent Transformers tend to adopt Pre-LN because training Post-LN with deep Transformers, e.g., ten or more layers, often becomes unstable and results in useless models. In contrast, however, Post-LN has consistently achieved better performance than Pre-LN in relatively shallow Transformers, e.g., six or fewer layers. This study first investigates the reasons for these discrepant observations empirically and theoretically, and discovers that (1) the LN in Post-LN is the source of the vanishing gradient problem that mainly causes unstable training, whereas Pre-LN prevents it, and (2) Post-LN tends to preserve larger gradient norms in higher layers during back-propagation, which may lead to more effective training. Exploiting these findings, we propose a method that provides both high stability and effective training via a simple modification of Post-LN. We conduct experiments on a wide range of text generation tasks and demonstrate that our method outperforms Pre-LN and trains stably regardless of shallow or deep layer settings."
}
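Since the abstract turns on where LayerNorm sits relative to the residual connection, here is a minimal PyTorch sketch of the two sublayer orderings it contrasts. This is not the authors' code; the module and argument names (`PostLNBlock`, `PreLNBlock`, `d_model`, `n_heads`, `d_ff`) are illustrative assumptions, and the paper's B2T modification of Post-LN is not reproduced here.

```python
import torch
import torch.nn as nn

class PostLNBlock(nn.Module):
    """Post-LN: LayerNorm is applied after each residual addition."""
    def __init__(self, d_model, n_heads, d_ff):
        super().__init__()
        self.attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.ffn = nn.Sequential(nn.Linear(d_model, d_ff), nn.ReLU(), nn.Linear(d_ff, d_model))
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x):
        x = self.ln1(x + self.attn(x, x, x)[0])  # LN sits after the residual sum
        x = self.ln2(x + self.ffn(x))
        return x

class PreLNBlock(nn.Module):
    """Pre-LN: LayerNorm is applied to the sublayer input; the residual path is unnormalized."""
    def __init__(self, d_model, n_heads, d_ff):
        super().__init__()
        self.attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.ffn = nn.Sequential(nn.Linear(d_model, d_ff), nn.ReLU(), nn.Linear(d_ff, d_model))
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x):
        h = self.ln1(x)
        x = x + self.attn(h, h, h)[0]  # residual path bypasses LN
        x = x + self.ffn(self.ln2(x))
        return x

# quick shape check (illustrative)
x = torch.randn(2, 16, 64)  # (batch, sequence, d_model)
print(PostLNBlock(64, 4, 256)(x).shape, PreLNBlock(64, 4, 256)(x).shape)
```

The difference the abstract discusses is visible in the forward passes: Post-LN normalizes the sum of the residual and the sublayer output, while Pre-LN normalizes only the sublayer input and leaves the residual path untouched.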
Markdown (Informal)
[B2T Connection: Serving Stability and Performance in Deep Transformers](https://aclanthology.org/2023.findings-acl.192/) (Takase et al., Findings 2023)