% Conference paper in the EACL 2026 proceedings (Volume 1: Long Papers).
% Field values are kept as exported; only delimiters, field-name casing and
% layout were normalised.
% NOTE(review): the url field points at an Anthology preview/ingest host
% (preview.aclanthology.org/ingest-eacl) -- confirm whether it should be
% replaced by the permanent URL once the volume is published.
@inproceedings{ball-etal-2026-understanding,
  title     = {Understanding Jailbreak Success: A Study of Latent Space Dynamics in Large Language Models},
  author    = {Ball, Sarah and
               Kreuter, Frauke and
               Panickssery, Nina},
  editor    = {Demberg, Vera and
               Inui, Kentaro and
               Marquez, Llu{\'i}s},
  booktitle = {Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-eacl/2026.eacl-long.12/},
  pages     = {250--279},
  isbn      = {979-8-89176-380-7},
  abstract  = {Conversational large language models are trained to refuse to answer harmful questions. However, emergent jailbreaking techniques can still elicit unsafe outputs, presenting an ongoing challenge for model alignment. This paper aims to deepen our understanding of how different jailbreak types circumvent safeguards by analyzing model activations on different jailbreak inputs. We find that it is possible to extract a jailbreak vector from a single class of jailbreaks that works to mitigate jailbreak effectiveness from other, semantically-dissimilar classes. This suggests that diverse jailbreaks may exploit a common internal mechanism. We investigate a potential common mechanism of harmfulness feature suppression, and find evidence that effective jailbreaks noticeably reduce a model{'}s perception of prompt harmfulness. These insights pave the way for developing more robust jailbreak countermeasures and lay the groundwork for a deeper, mechanistic understanding of jailbreak dynamics in language models.},
}
Markdown (Informal)
[Understanding Jailbreak Success: A Study of Latent Space Dynamics in Large Language Models](https://preview.aclanthology.org/ingest-eacl/2026.eacl-long.12/) (Ball et al., EACL 2026)
ACL