% Workshop paper from the TrustNLP 2026 proceedings (see booktitle below).
% NOTE(review): the url field points at a preview.aclanthology.org ingest
% branch; confirm it is still live, or swap in the permanent link once known.
@inproceedings{hegazy-etal-2026-guiding,
  title     = {Guiding Giants: Lightweight Controllers for Weighted Activation Steering in {LLMs}},
  author    = {Hegazy, Amr and
               Elhoushi, Mostafa and
               Alanwar, Amr},
  editor    = {Chang, Kai-Wei and
               Mehrabi, Ninareh and
               Krishna, Satyapriya and
               Das, Anubrata and
               Dhamala, Jwala and
               Cao, Yang Trista and
               Kumarage, Tharindu and
               Ramakrishna, Anil and
               Christodoulopoulos, Christos and
               Wan, Yixin and
               Galystan, Aram and
               Kumar, Anoop and
               Gupta, Rahul},
  booktitle = {Proceedings of the 6th Workshop on Trustworthy {NLP} ({TrustNLP} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.trustnlp-main.46/},
  pages     = {584--599},
  isbn      = {979-8-89176-418-7},
  abstract  = {Controlling undesirable LLM behaviors typically requires costly fine-tuning, while existing inference-time steering methods lack fine-grained adaptivity. We introduce a lightweight, trainable controller network for adaptive inference-time control. The controller observes intermediate LLM activations to predict a global scaling factor and layer-specific weights, which dynamically modulate a pre-computed ``refusal direction'' vector. Trained on harmful and benign prompts, the controller learns to apply nuanced, layer-aware steering selectively. Experiments on Llama and Mistral models show our method significantly increases refusal rates on safety benchmarks like ToxicChat, outperforming existing approaches without altering the original model parameters.},
}
Markdown (Informal)
[Guiding Giants: Lightweight Controllers for Weighted Activation Steering in LLMs](https://preview.aclanthology.org/ingest-acl-workshops/2026.trustnlp-main.46/) (Hegazy et al., TrustNLP 2026)
ACL