@comment{Workshop paper in the KnowFM 2026 proceedings (ACL Anthology export).
  NOTE(review): the url field points at a preview/ingest build of the Anthology;
  confirm and swap in the canonical URL once the volume is published.}
@inproceedings{bugaud-2026-blind,
  title     = {Blind Single-Layer Activation Edits Show a Break/Fix Asymmetry in Factual Recall},
  author    = {Bugaud, Zacharie},
  editor    = {Chen, Canyu and
               Zhang, Yuji and
               Li, Zoey Sha and
               Wang, Zihan and
               Wang, Qineng and
               Su, Jinyan and
               Kargupta, Priyanka and
               Marjanovi{\'c}, Sara Vera and
               Pan, Jeff Z. and
               Bansal, Mohit and
               Augenstein, Isabelle and
               Han, Jiawei and
               Ji, Heng and
               Li, Manling},
  booktitle = {Proceedings of the 4th Workshop on Towards Knowledgeable Foundation Models ({KnowFM} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.knowfm-1.2/},
  pages     = {13--24},
  isbn      = {979-8-89176-403-3},
  abstract  = {Can factual errors in language models be repaired by editing a single hidden activation at inference time? We compare blind edits, which are not told the correct answer, with oracle edits that receive answer-specific information. On Pythia-6.9B, with corruption replicated on Pythia-1B and GPT-2 XL, we find a strong break/fix asymmetry: single-layer perturbations easily corrupt correct factual recall, flipping 74--100{\%} of initially correct answers, but blind repair is much harder. On EntityConfusion, twelve blind non-gradient interventions from four families fail to repair stable hallucinations in the strict single-layer setting; relaxed multi-layer or multi-head variants improve net accuracy by only $+3$ percentage points. Blind gradient optimization repairs more errors, but often breaks already-correct answers. In contrast, oracle edits given the correct answer repair many more hallucinations, fixing 68{\%} at the default layer and up to 82{\%} at a better layer. These results suggest that the main barrier is not whether factual recall can be steered, but whether a blind method can identify the right target-specific direction. TriviaQA is a boundary case: blind confidence maximization outperforms the single-token oracle, but the comparison is complicated because evaluation accepts multiple aliases.},
}
Markdown (Informal)
[Blind Single-Layer Activation Edits Show a Break/Fix Asymmetry in Factual Recall](https://preview.aclanthology.org/ingest-acl-workshops/2026.knowfm-1.2/) (Bugaud, KnowFM 2026)
ACL