% Workshop paper record (inproceedings) as exported from the ACL Anthology.
% The editor field lists the editors of the proceedings volume, one per line.
% Editor surname corrected from "Galystan" to "Galstyan".
% NOTE(review): url points at the preview.aclanthology.org ingestion host;
%   presumably it should become the canonical Anthology URL once the volume
%   is published -- confirm before relying on it.
% NOTE(review): address is kept exactly as exported; it looks like the event
%   location rather than the publisher's city -- confirm against the target style.
@inproceedings{bugaud-2026-single,
  title     = {Single-Layer Activation Edits Easily Corrupt Factual Recall but Rarely Repair It},
  author    = {Bugaud, Zacharie},
  editor    = {Chang, Kai-Wei and
               Mehrabi, Ninareh and
               Krishna, Satyapriya and
               Das, Anubrata and
               Dhamala, Jwala and
               Cao, Yang Trista and
               Kumarage, Tharindu and
               Ramakrishna, Anil and
               Christodoulopoulos, Christos and
               Wan, Yixin and
               Galstyan, Aram and
               Kumar, Anoop and
               Gupta, Rahul},
  booktitle = {Proceedings of the 6th Workshop on Trustworthy {NLP} ({T}rust{NLP} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.trustnlp-main.38/},
  pages     = {515--527},
  isbn      = {979-8-89176-418-7},
  abstract  = {Single-layer activation edits easily corrupt a language model{'}s correct factual answers but rarely repair its errors. On a curated factual-recall benchmark, corruption flips 70{--}100{\%} of correct answers across three models, while twelve blind methods (no access to the correct answer) fix at most 6{\%} within every evaluation pool. Per-instance gradient optimization ostensibly fixes 39{\%}, but norm-constrained analysis reveals a magnitude artifact: at oracle-matched norms the fix rate drops to random, directions are nearly orthogonal to oracle directions (cos = -0.04), and collateral damage makes the net effect negative. An oracle ablation controlling for budget, target identity, and directional noise points to a direction-selection bottleneck: repair requires a precise, per-question direction that blind methods cannot locate. Target-informed methods partially succeed but none generalizes to unseen distributions.},
}
Markdown (Informal)
[Single-Layer Activation Edits Easily Corrupt Factual Recall but Rarely Repair It](https://preview.aclanthology.org/ingest-acl-workshops/2026.trustnlp-main.38/) (Bugaud, TrustNLP 2026)
ACL