% Conference paper: SafetyMem, in the ACL 2026 long-papers proceedings volume.
% NOTE(review): the url field points at a "preview ... ingest-acl" host, which
% looks like a staging location -- confirm the canonical Anthology URL (and add
% a doi, if one has been assigned) before relying on this entry.
% NOTE(review): address carries the conference venue rather than the
% publisher's city; kept as exported -- confirm against the target style.
@inproceedings{wang-etal-2026-safetymem,
  title     = {{SafetyMem}: Adaptive Jailbreak Defense via Dual-Component Safety Memory},
  author    = {Wang, Hao and
               Ni, Ziyi and
               Wang, Huacan and
               Lyu, Pin and
               Sha, Lei},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.1168/},
  pages     = {25486--25509},
  isbn      = {979-8-89176-390-6},
  abstract  = {Current defenses for Large Language Models (LLMs) often suffer from a ``memory gap'': parameter-modifying methods are computationally rigid, while inference-time filters cannot retain or reuse defense knowledge across interactions. To address this, we propose SafetyMem, a novel framework that secures LLMs through a dual-component safety memory system. SafetyMem consists of Semantic Safety Memory (SSM), which consolidates diverse jailbreak attempts into a structured knowledge base of attack patterns, and Episodic Safety Memory (ESM), which maintains an evolving set of procedural rules refined from historical detection failures. Unlike static defenses, SafetyMem allows the model to ``remember'' and adapt to emerging adversarial strategies without parameter retraining. To further enhance robustness, we introduce an adversarial memory expansion mechanism that proactively generates challenging variants to solidify these memories. Experiments on standard and stealthy jailbreak benchmarks show that SafetyMem substantially reduces attack success rates while preserving efficiency and interpretability, consistently outperforming state-of-the-art baselines across multiple LLMs.},
}
Markdown (Informal)
[SafetyMem: Adaptive Jailbreak Defense via Dual-Component Safety Memory](https://preview.aclanthology.org/ingest-acl/2026.acl-long.1168/) (Wang et al., ACL 2026)
ACL