@inproceedings{janiak-etal-2024-adversarial,
title = "An Adversarial Example for Direct Logit Attribution: Memory Management in {GELU}-4{L}",
author = "Janiak, Jett and
Rager, Can and
Dao, James and
Lau, Yeu-Tong",
editor = "Belinkov, Yonatan and
Kim, Najoung and
Jumelet, Jaap and
Mohebbi, Hosein and
Mueller, Aaron and
Chen, Hanjie",
booktitle = "Proceedings of the 7th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP",
month = nov,
year = "2024",
address = "Miami, Florida, US",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/add-emnlp-2024-awards/2024.blackboxnlp-1.15/",
doi = "10.18653/v1/2024.blackboxnlp-1.15",
pages = "232--237",
abstract = "Prior work suggests that language models manage the limited bandwidth of the residual stream through a {\textquotedblleft}memory management{\textquotedblright} mechanism, where certain attention heads and MLP layers clear residual stream directions set by earlier layers. Our study provides concrete evidence for this erasure phenomenon in a 4-layer transformer, identifying heads that consistently remove the output of earlier heads. We further demonstrate that direct logit attribution (DLA), a common technique for interpreting the output of intermediate transformer layers, can show misleading results by not accounting for erasure."
}
Markdown (Informal)
[An Adversarial Example for Direct Logit Attribution: Memory Management in GELU-4L](https://aclanthology.org/2024.blackboxnlp-1.15/) (Janiak et al., BlackboxNLP 2024)
ACL
Jett Janiak, Can Rager, James Dao, and Yeu-Tong Lau. 2024. [An Adversarial Example for Direct Logit Attribution: Memory Management in GELU-4L](https://aclanthology.org/2024.blackboxnlp-1.15/). In Proceedings of the 7th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP, pages 232–237, Miami, Florida, US. Association for Computational Linguistics.
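
The abstract describes direct logit attribution (DLA), the technique the paper shows can mislead when erasure is not accounted for. As a quick illustration of what DLA computes, here is a minimal NumPy sketch; the shapes, variable names, and random weights are illustrative assumptions rather than the paper's code, and final-LayerNorm folding is omitted.

```python
# Minimal sketch of direct logit attribution (DLA): project one component's
# residual-stream contribution through the unembedding to estimate its
# direct effect on a token's logit. All values here are illustrative.
import numpy as np

d_model, d_vocab = 512, 50257  # assumed dimensions, for illustration only
rng = np.random.default_rng(0)

W_U = rng.normal(size=(d_model, d_vocab))  # unembedding matrix (assumed random)
head_out = rng.normal(size=(d_model,))     # one attention head's write to the residual stream

# DLA: dot the component's output with the unembedding column of the token
# of interest. If a later head erases head_out, this number overstates the
# head's actual contribution to the final logits -- the paper's key point.
token_id = 42
dla = head_out @ W_U[:, token_id]
print(f"DLA of this head on token {token_id}: {dla:.3f}")
```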