@inproceedings{ardoin-etal-2025-confabulation,
  title     = {Where Confabulation Lives: Latent Feature Discovery in {LLM}s},
  author    = {Ardoin, Thibaud and
               Cai, Yi and
               Wunder, Gerhard},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Ros{\'e}, Carolyn and
               Peng, Violet},
  booktitle = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.emnlp-main.1515/},
  pages     = {29801--29825},
  isbn      = {979-8-89176-332-6},
  abstract  = {Hallucination remains a critical failure mode of large language models (LLMs), undermining their trustworthiness in real-world applications. In this work, we focus on confabulation, a foundational aspect of hallucination where the model fabricates facts about unknown entities. We introduce a targeted dataset designed to isolate and analyze this behavior across diverse prompt types. Using this dataset, and building on recent progress in interpreting LLM internals, we extract latent directions associated with confabulation using sparse projections. A simple vector-based steering method demonstrates that these directions can modulate model behavior with minimal disruption, shedding light on the inner representations that drive factual and non-factual output. Our findings contribute to a deeper mechanistic understanding of LLMs and pave the way toward more trustworthy and controllable generation. We release the code and dataset at https://github.com/Thibaud-Ardoin/where-confabulation-lives.},
}
@comment{Markdown (Informal)}
@comment{[Where Confabulation Lives: Latent Feature Discovery in LLMs](https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-main.1515/) (Ardoin et al., EMNLP 2025) ACL}