@inproceedings{vig-2019-multiscale,
title = "A Multiscale Visualization of Attention in the Transformer Model",
author = "Vig, Jesse",
editor = "Costa-juss{\`a}, Marta R. and
Alfonseca, Enrique",
booktitle = "Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics: System Demonstrations",
month = jul,
year = "2019",
address = "Florence, Italy",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/P19-3007/",
doi = "10.18653/v1/P19-3007",
pages = "37--42",
abstract = "The Transformer is a sequence model that forgoes traditional recurrent architectures in favor of a fully attention-based approach. Besides improving performance, an advantage of using attention is that it can also help to interpret a model by showing how the model assigns weight to different input elements. However, the multi-layer, multi-head attention mechanism in the Transformer model can be difficult to decipher. To make the model more accessible, we introduce an open-source tool that visualizes attention at multiple scales, each of which provides a unique perspective on the attention mechanism. We demonstrate the tool on BERT and OpenAI GPT-2 and present three example use cases: detecting model bias, locating relevant attention heads, and linking neurons to model behavior."
}
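
The open-source tool introduced in this paper is BertViz (https://github.com/jessevig/bertviz). As a rough sketch of the head-level view the abstract describes — assuming the current `bertviz` and HuggingFace `transformers` package APIs, which postdate the paper's original release — attention weights can be extracted from BERT and rendered like so:

```python
# Minimal sketch: visualizing BERT's multi-head attention with BertViz.
# Assumes the bertviz and transformers packages (pip install bertviz transformers)
# and a Jupyter notebook for rendering; API names reflect current releases,
# not necessarily the version demonstrated in the paper.
from transformers import AutoModel, AutoTokenizer
from bertviz import head_view

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased", output_attentions=True)

inputs = tokenizer("The doctor asked the nurse a question.", return_tensors="pt")
outputs = model(**inputs)

# One tensor per layer, each of shape (batch, num_heads, seq_len, seq_len).
attention = outputs.attentions
tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

# Interactive head-level view: one attention pattern per layer and head.
head_view(attention, tokens)
```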