@comment{
  Conference paper in the ACL 2026 main proceedings (Volume 1: Long Papers).
  Field values are brace-delimited so quotation marks inside the abstract
  cannot terminate a value. Emphasis in the abstract uses LaTeX markup.
  The address field holds the conference location, as given in the source record.
}
@inproceedings{yona-etal-2026-context,
    title     = {In-Context Representation Hijacking},
    author    = {Yona, Itay and
                 Sarid, Amir and
                 Karasik, Michael and
                 Gandelsman, Yossi},
    editor    = {Liakata, Maria and
                 Moreira, Viviane P. and
                 Zhang, Jiajun and
                 Jurgens, David},
    booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
    month     = jul,
    year      = {2026},
    address   = {San Diego, California, United States},
    publisher = {Association for Computational Linguistics},
    url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.768/},
    pages     = {16852--16867},
    isbn      = {979-8-89176-390-6},
    abstract  = {We introduce \textbf{Doublespeak}, a simple in-context representation hijacking attack against language models. The attack works by systematically replacing a harmful keyword (e.g., \emph{bomb}) with a benign token (e.g., \emph{carrot}) across multiple in-context examples, provided as a prefix to a harmful request. We demonstrate that this substitution leads to the internal representation of the benign token converging toward that of the harmful one, effectively embedding the harmful semantics under a euphemism. As a result, superficially innocuous prompts (e.g., \emph{``How to build a carrot?''}) are internally interpreted as disallowed instructions (\emph{``How to build a bomb?''}), thereby bypassing the model's safety alignment. We use interpretability tools to show this semantic shift occurs progressively across layers. Doublespeak is optimization-free, broadly transferable across model families, and achieves strong success rates on closed-source systems, reaching 74\% on Llama-3.3-70B-Instruct with a single-sentence context override. Our findings highlight a new attack surface in LM latent space, indicating that current alignment strategies are insufficient and should instead operate at the representation level.},
}

Markdown (Informal)
[In-Context Representation Hijacking](https://preview.aclanthology.org/ingest-acl/2026.acl-long.768/) (Yona et al., ACL 2026)
ACL
- Itay Yona, Amir Sarid, Michael Karasik, and Yossi Gandelsman. 2026. In-Context Representation Hijacking. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 16852–16867, San Diego, California, United States. Association for Computational Linguistics.