@inproceedings{nanda-etal-2023-emergent,
  title     = {Emergent Linear Representations in World Models of Self-Supervised Sequence Models},
  author    = {Nanda, Neel and Lee, Andrew and Wattenberg, Martin},
  editor    = {Belinkov, Yonatan and Hao, Sophie and Jumelet, Jaap and Kim, Najoung and McCarthy, Arya and Mohebbi, Hosein},
  booktitle = {Proceedings of the 6th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP},
  month     = dec,
  year      = {2023},
  address   = {Singapore},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.blackboxnlp-1.2},
  doi       = {10.18653/v1/2023.blackboxnlp-1.2},
  pages     = {16--30},
  abstract  = {How do sequence models represent their decision-making process? Prior work suggests that Othello-playing neural network learned nonlinear models of the board state (Li et al., 2023a). In this work, we provide evidence of a closely related linear representation of the board. In particular, we show that probing for {``}my colour{''} vs. {``}opponent{'}s colour{''} may be a simple yet powerful way to interpret the model{'}s internal state. This precise understanding of the internal representations allows us to control the model{'}s behaviour with simple vector arithmetic. Linear representations enable significant interpretability progress, which we demonstrate with further exploration of how the world model is computed.},
}
@comment{
Markdown (Informal):
[Emergent Linear Representations in World Models of Self-Supervised Sequence Models](https://aclanthology.org/2023.blackboxnlp-1.2) (Nanda et al., BlackboxNLP-WS 2023)
Source: ACL Anthology citation page.
}