@inproceedings{teplica-etal-2025-sciurus,
title = "{SCIUR}us: Shared Circuits for Interpretable Uncertainty Representations in Language Models",
author = "Teplica, Carter and
Liu, Yixin and
Cohan, Arman and
Rudner, Tim G. J.",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.naacl-long.618/",
pages = "12451--12469",
ISBN = "979-8-89176-189-6",
    abstract = "We investigate the mechanistic sources of uncertainty in large language models (LLMs), an area with important implications for language model reliability and trustworthiness. To do so, we conduct a series of experiments designed to identify whether the factuality of generated responses and a model's uncertainty originate in separate or shared circuits in the model architecture. We approach this question by adapting the well-established mechanistic interpretability techniques of causal tracing and zero-ablation to study the effect of different circuits on LLM generations. Our experiments on eight different models and five datasets, representing tasks predominantly requiring factual recall, provide strong evidence that a model's uncertainty is produced in the same parts of the network that are responsible for the factuality of generated responses."
}