% ACL Anthology record 2026.acl-long.300 (ACL 2026, Volume 1: Long Papers).
% NOTE(review): the export this entry came from linked to the temporary
% preview.aclanthology.org/ingest-acl staging branch; the url field below
% assumes the standard aclanthology.org/<anthology-id>/ pattern. Confirm that
% it resolves once the ingest has been merged.
% Markdown emphasis markers were stripped from the abstract so that no literal
% asterisks are printed by styles that typeset it.
@inproceedings{alon-etal-2026-faithful,
  title     = {Faithful Serum: Mitigating the Faithfulness Gap in Textual Explanations of {LLM} Decisions via Attribution Guidance},
  author    = {Alon, Bar and
               Zimerman, Itamar and
               Wolf, Lior},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.acl-long.300/},
  pages     = {6622--6645},
  isbn      = {979-8-89176-390-6},
  abstract  = {Large language models (LLMs) achieve strong performance and have revolutionized NLP, but their lack of explainability keeps them treated as black boxes, limiting their use in domains that demand transparency and trust. A promising direction to address this issue is post-hoc text-based explanations, which aim to explain model decisions in natural language. Prior work has focused on generating convincing rationales that appear to be subjectively faithful, but it remains unclear whether these explanations are epistemically faithful - that is, whether they reflect the internal evidence the model actually relied on for its decision. In this paper, we first assess the epistemic faithfulness of LLM-generated explanations via counterfactuals and show that they are often unfaithful. We then introduce a training-free method, that enhances faithfulness by guiding explanation generation through attention-level interventions, informed by token-level heatmaps extracted via a faithful attribution method. This method significantly improves epistemic faithfulness across multiple models, benchmarks, and prompts. Our code is attached as supplementary material.},
}
Markdown (Informal)
[Faithful Serum: Mitigating the Faithfulness Gap in Textual Explanations of LLM Decisions via Attribution Guidance](https://aclanthology.org/2026.acl-long.300/) (Alon et al., ACL 2026)
ACL