@inproceedings{dang-ngo-2026-selective,
  title     = {Selective Steering: Norm-Preserving Control Through Discriminative Layer Selection},
  author    = {Dang, Quy-Anh and
               Ngo, Chris},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-acl.529/},
  pages     = {10887--10910},
  isbn      = {979-8-89176-395-1},
  abstract  = {Despite significant progress in alignment, large language models (LLMs) remain vulnerable to adversarial attacks that elicit harmful behaviors. Activation steering techniques offer a promising inference-time intervention approach, but existing methods suffer from critical limitations: activation addition requires careful coefficient tuning and is sensitive to layer-specific norm variations, while directional ablation provides only binary control. Recent work on Angular Steering introduces continuous control via rotation in a 2D subspace, but its practical implementation violates norm preservation, causing distribution shift and generation collapse, particularly in models below 7B parameters. We propose Selective Steering, which addresses these limitations through two key innovations: (1) a mathematically rigorous norm-preserving rotation formulation that maintains activation distribution integrity, and (2) discriminative layer selection that applies steering only where feature representations exhibit opposite-signed class alignment. Experiments across nine models demonstrate that Selective Steering achieves 5.5{$\times$} higher attack success rates than prior methods while maintaining zero perplexity violations and approximately 100\% capability retention on standard benchmarks. Our approach provides a principled, efficient framework for controllable and stable LLM behavior modification.},
}
Markdown (Informal)
[Selective Steering: Norm-Preserving Control Through Discriminative Layer Selection](https://aclanthology.org/2026.findings-acl.529/) (Dang & Ngo, Findings 2026)
ACL