% Findings of ACL 2026 paper on a circuit-based difficulty metric for machine unlearning.
% NOTE(review): the exported abstract had lost the metric's short name (empty
%   parentheses, an empty \textit{}, and two sentences without a subject). It is
%   restored below as "CUD", derived from the spelled-out name "Circuit-guided
%   Unlearning Difficulty", with the emphasis placed on "pre-unlearning" --
%   confirm both against the published abstract.
% NOTE(review): url points at the preview.aclanthology.org ingest host -- confirm
%   whether a canonical Anthology URL (or a DOI) should replace it.
% NOTE(review): address carries the conference venue, as exported by the Anthology.
@inproceedings{cheng-etal-2026-mechanistic,
  title     = {A Mechanistic Perspective and Difficulty Metric for Unlearning},
  author    = {Cheng, Jiali and
               Chen, Ziheng and
               Agarwal, Chirag and
               Amiri, Hadi},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.532/},
  pages     = {10950--10964},
  isbn      = {979-8-89176-395-1},
  abstract  = {Machine unlearning is becoming essential for building trustworthy and compliant language models. Yet unlearning success varies considerably across individual samples: some are reliably erased, while others persist despite the same procedure. We argue that this disparity is not only a data-side phenomenon, but also reflects model-internal mechanisms that encode and protect memorized information. We study this problem from a mechanistic perspective based on model circuits{--}structured interaction pathways that govern how predictions are formed. We propose Circuit-guided Unlearning Difficulty (CUD), a \textit{pre-unlearning} metric that assigns each sample a continuous difficulty score using circuit-level signals. Extensive experiments demonstrate that CUD reliably separates intrinsically easy and hard samples, and remains stable across unlearning methods. We identify key circuit-level patterns that reveal a mechanistic signature of unlearning difficulty: easy-to-unlearn samples are associated with shorter, shallower interactions concentrated in earlier-to-intermediate parts of the original model, whereas hard-to-unlearn samples rely on longer and deeper pathways closer to late-stage computation. Compared to existing qualitative studies, CUD takes a first step toward a principled, fine-grained, and interpretable analysis of unlearning difficulty; and motivates the development of unlearning methods grounded in model mechanisms.},
}
Markdown (Informal)
[A Mechanistic Perspective and Difficulty Metric for Unlearning](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.532/) (Cheng et al., Findings 2026)
ACL