@inproceedings{du-etal-2025-fine,
    title = "Fine-Grained Manipulation of Arithmetic Neurons",
    author = "Du, Wenyu and
      Zheng, Rui and
      Luo, Tongxu and
      Chung, Stephen and
      Fu, Jie",
    editor = "Belinkov, Yonatan and
      Mueller, Aaron and
      Kim, Najoung and
      Mohebbi, Hosein and
      Chen, Hanjie and
      Arad, Dana and
      Sarti, Gabriele",
    booktitle = "Proceedings of the 8th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP",
    month = nov,
    year = "2025",
    address = "Suzhou, China",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.blackboxnlp-1.27/",
    pages = "467--479",
    isbn = "979-8-89176-346-3",
    abstract = "It is a longstanding challenge to understand how neural models perform mathematical reasoning. Recent mechanistic interpretability work indicates that large language models (LLMs) use a ``bag of heuristics'' in middle to late-layer MLP neurons for arithmetic, where each heuristic promotes logits for specific numerical patterns. Building on this, we aim for fine-grained manipulation of these heuristic neurons to causally steer model predictions towards specific arithmetic outcomes, moving beyond simply disrupting accuracy. This paper presents a methodology that enables the systematic identification and causal manipulation of heuristic neurons, which is applied to the addition task in this study. We train a linear classifier to predict heuristics based on activation values, achieving over 90{\%} classification accuracy. The trained classifier also allows us to rank neurons by their importance to a given heuristic. By targeting a small set of top-ranked neurons (K=50), we demonstrate high success rates{---}over 80{\%} for the ones place and nearly 70{\%} for the tens place{---}in controlling addition outcomes. This manipulation is achieved by transforming the activation of identified neurons into specific target heuristics by zeroing out source-heuristic neurons and adjusting target-heuristic neurons towards their class activation centroids. We explain these results by hypothesizing that high-ranking neurons possess `cleaner channels' for their heuristics, supported by Signal-to-Noise Ratio (SNR) analysis where these neurons show higher SNR scores. Our work offers a robust approach to dissect, causally test, and precisely influence LLM arithmetic, advancing understanding of their internal mechanisms."
}
Markdown (Informal)
[Fine-Grained Manipulation of Arithmetic Neurons](https://aclanthology.org/2025.blackboxnlp-1.27/) (Du et al., BlackboxNLP 2025)
ACL
- Wenyu Du, Rui Zheng, Tongxu Luo, Stephen Chung, and Jie Fu. 2025. Fine-Grained Manipulation of Arithmetic Neurons. In Proceedings of the 8th BlackboxNLP Workshop: Analyzing and Interpreting Neural Networks for NLP, pages 467–479, Suzhou, China. Association for Computational Linguistics.