@inproceedings{nanda-etal-2023-emergent,
title = "Emergent Linear Representations in World Models of Self-Supervised Sequence Models",
author = "Nanda, Neel and
Lee, Andrew and
Wattenberg, Martin",
editor = "Belinkov, Yonatan and
Hao, Sophie and
Jumelet, Jaap and
Kim, Najoung and
McCarthy, Arya and
Mohebbi, Hosein",
booktitle = "Proceedings of the 6th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/add_missing_videos/2023.blackboxnlp-1.2/",
doi = "10.18653/v1/2023.blackboxnlp-1.2",
pages = "16--30",
abstract = "How do sequence models represent their decision-making process? Prior work suggests that Othello-playing neural network learned nonlinear models of the board state (Li et al., 2023a). In this work, we provide evidence of a closely related linear representation of the board. In particular, we show that probing for {\textquotedblleft}my colour{\textquotedblright} vs. {\textquotedblleft}opponent`s colour{\textquotedblright} may be a simple yet powerful way to interpret the model`s internal state. This precise understanding of the internal representations allows us to control the model`s behaviour with simple vector arithmetic. Linear representations enable significant interpretability progress, which we demonstrate with further exploration of how the world model is computed."
}
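
The abstract describes two techniques: fitting a linear probe for a "my colour vs. opponent's colour" board feature on the model's hidden states, and steering the model by adding the probe direction back into an activation. Below is a minimal, hypothetical sketch of that recipe, not the authors' code: the hidden size, the intervention strength, and the synthetic activations are all stand-in assumptions so the snippet runs on its own.

```python
# Hypothetical sketch (not the paper's code): a linear probe for a
# "my colour" vs. "opponent's colour" feature on hidden states, plus the
# vector-arithmetic intervention the abstract mentions. Activations are
# synthetic stand-ins so the example is self-contained and runnable.
import torch

d_model = 512            # assumed hidden size
n_samples = 1024

torch.manual_seed(0)
true_dir = torch.randn(d_model)               # planted feature direction
labels = torch.randint(0, 2, (n_samples,)).float()  # 1 = mine, 0 = opponent's
acts = torch.randn(n_samples, d_model) + labels[:, None] * true_dir

# Linear probe: logistic regression on the activations.
probe = torch.nn.Linear(d_model, 1)
opt = torch.optim.Adam(probe.parameters(), lr=1e-2)
loss_fn = torch.nn.BCEWithLogitsLoss()
for _ in range(200):
    opt.zero_grad()
    loss = loss_fn(probe(acts).squeeze(-1), labels)
    loss.backward()
    opt.step()

acc = ((probe(acts).squeeze(-1) > 0).float() == labels).float().mean()
print(f"probe accuracy: {acc:.3f}")

# Steering by vector arithmetic: move one activation along the probe
# direction to flip which colour the probe reads off for that square.
with torch.no_grad():
    direction = probe.weight[0] / probe.weight[0].norm()
    h = acts[0]
    alpha = 8.0                              # assumed intervention strength
    h_steered = h - alpha * direction        # push toward "opponent's colour"
    print("p(mine) before:", torch.sigmoid(probe(h)).item(),
          "after:", torch.sigmoid(probe(h_steered)).item())
```

In the paper's setting the activations would come from the Othello-playing sequence model rather than a random generator, with one probe per board square; the steering step then edits the residual stream mid-forward-pass instead of a stored activation.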