@comment{
  Conference-paper record (Findings of ACL 2026) for Han, Liu, Zhang and Li.
  NOTE(review): the url field points at a preview.aclanthology.org ingest
  build, which looks like a staging address. Confirm the permanent address
  (presumably https://aclanthology.org/2026.findings-acl.1188/) once the
  volume is published, then update the field.
}
@inproceedings{han-etal-2026-experience,
  title     = {Experience-Driven Multi-Agent Optimization for Black-Box Jailbreak Attacks on Large Language Models},
  author    = {Han, Zhaoyang and
               Liu, Yihe and
               Zhang, Kai and
               Li, Ping},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1188/},
  pages     = {23729--23747},
  isbn      = {979-8-89176-395-1},
  abstract  = {The rapid discovery of jailbreak prompts has revealed the alarming fragility of safety alignment in frontier large language models (LLMs). While jailbreak techniques play a critical role in red-teaming and safety evaluation, existing methods exhibit three key limitations: (i) poor transferability across model families, requiring model-specific manual tuning; (ii) heavy reliance on large-scale prompt enumeration or exhaustive search, causing prohibitive query costs and poor scalability; and (iii) high sensitivity to input preprocessing or refusal-oriented fine-tuning, leading to attack failures once the underlying model is updated. To address these, we propose Experience-driven Multi-agent Jailbreak Optimization (EMJO), which couples three collaborating agents (Attacker, Analyzer, and Judge) into a closed-loop ``probe{--}evaluate{--}revise'' process, together with a dynamic experience bank accumulating high-quality successful prompts and reusable strategy patterns across iterations and tasks. This design enables query-efficient and transferable jailbreak optimization under black-box access. Extensive experiments on diverse LLMs demonstrate that EMJO consistently outperforms existing black-box jailbreak baselines, achieving up to 11{\%} absolute improvement in attack success rate while reducing the average query cost by up to 7.9$\times$ across two benchmark datasets. These results indicate that EMJO offers an effective and scalable paradigm for systematic jailbreak discovery.},
}

Markdown (Informal)
[Experience-Driven Multi-Agent Optimization for Black-Box Jailbreak Attacks on Large Language Models](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1188/) (Han et al., Findings 2026)
ACL