% Conference paper from the ACL 2026 proceedings (Volume 1: Long Papers).
% NOTE(review): the url field points at a preview/ingest host; confirm the
% canonical Anthology URL (and add a DOI if one exists) before citing.
@inproceedings{zhong-etal-2026-activation,
  author    = {Zhong, Lingfeng and
               Xu, Qiongkai and
               Naseem, Usman},
  title     = {Activation Decomposition and Steering for {LLM} Backdoor Remediation},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  pages     = {43713--43737},
  isbn      = {979-8-89176-390-6},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.2025/},
  abstract  = {Existing works on defending against LLM backdoor attacks rely on either auxiliary models or safety-related datasets for defending against backdoor attacks on large language models, which are not always available. To address these challenges, we propose our Contrastive-Selective Activation Decomposition and Steering (CS-ADS), which contrasts relatively more benign and poisoned settings to decompose the feature vectors for steering without relying on additional auxiliary models or datasets. With such disentangled vectors for remediation, our method can achieve feasible defense qualities even better than dataset-based contrastive steering strategies. This novel decomposition-based solution is motivated by the key insight that feature representations of prompt pairs can encode the same benign semantics in different proportions, even when both prompt pairs are similarly backdoored. Such discrepancies allow our method to identify effective remediation directions for steering the generation process, thereby preventing undesired outputs. We evaluate CS-ADS against multiple state-of-the-art backdoor attacks, and experimental results show that CS-ADS provides effective defense across settings.},
}
Markdown (Informal)
[Activation Decomposition and Steering for LLM Backdoor Remediation](https://preview.aclanthology.org/ingest-acl/2026.acl-long.2025/) (Zhong et al., ACL 2026)
ACL
- Lingfeng Zhong, Qiongkai Xu, and Usman Naseem. 2026. Activation Decomposition and Steering for LLM Backdoor Remediation. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 43713–43737, San Diego, California, United States. Association for Computational Linguistics.