@inproceedings{brody-etal-2023-expressivity,
title = "On the Expressivity Role of {L}ayer{N}orm in Transformers' Attention",
author = "Brody, Shaked and
Alon, Uri and
Yahav, Eran",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2023",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.findings-acl.895/",
doi = "10.18653/v1/2023.findings-acl.895",
pages = "14211--14221",
    abstract = "Layer Normalization (LayerNorm) is an inherent component in all Transformer-based models. In this paper, we show that LayerNorm is crucial to the expressivity of the multi-head attention layer that follows it. This is in contrast to the common belief that LayerNorm's only role is to normalize the activations during the forward pass, and their gradients during the backward pass. We consider a geometric interpretation of LayerNorm and show that it consists of two components: (a) projection of the input vectors to a $(d-1)$-dimensional space that is orthogonal to the $[1, 1, ..., 1]$ vector, and (b) scaling of all vectors to the same norm of $\sqrt{d}$. We show that each of these components is important for the attention layer that follows it in Transformers: (a) projection allows the attention mechanism to create an attention query that attends to all keys equally, offloading the need to learn this operation in the attention; and (b) scaling allows each key to potentially receive the highest attention, and prevents keys from being {\textquotedblleft}un-select-able{\textquotedblright}. We show empirically that Transformers do indeed benefit from these properties of LayerNorm in general language modeling and even in computing simple functions such as {\textquotedblleft}majority{\textquotedblright}. Our code is available at \url{https://github.com/tech-srl/layer_norm_expressivity_role}."
}
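
The abstract's geometric reading of LayerNorm can be checked directly: without the learned gain and bias, LayerNorm(x) equals (a) projecting x onto the hyperplane orthogonal to the all-ones vector, then (b) rescaling the result to norm sqrt(d). The sketch below is an illustrative reconstruction of that decomposition, not the authors' released code (which lives at the GitHub URL above); the function names and the eps handling are assumptions for the sketch.

```python
# Minimal sketch: LayerNorm (no gain/bias) == projection orthogonal to [1,...,1],
# followed by rescaling to norm sqrt(d), as described in the abstract.
import numpy as np

def layernorm(x, eps=1e-5):
    """Standard LayerNorm over the last axis, without learned gain/bias."""
    return (x - x.mean()) / np.sqrt(x.var() + eps)

def project_and_scale(x, eps=1e-5):
    """Geometric decomposition: (a) remove the component along [1,...,1],
    then (b) rescale the projected vector to norm sqrt(d)."""
    d = x.shape[-1]
    ones = np.ones(d)
    proj = x - (x @ ones / d) * ones                 # (a) projection
    return np.sqrt(d) * proj / (np.linalg.norm(proj) + eps)  # (b) scaling

x = np.random.randn(16)
print(np.allclose(layernorm(x), project_and_scale(x), atol=1e-3))  # True (up to eps)
```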