@inproceedings{srivastav-zhang-2025-safe,
title = "Safe in Isolation, Dangerous Together: Agent-Driven Multi-Turn Decomposition Jailbreaks on {LLM}s",
author = "Srivastav, Devansh and
Zhang, Xiao",
editor = "Kamalloo, Ehsan and
Gontier, Nicolas and
Lu, Xing Han and
Dziri, Nouha and
Murty, Shikhar and
Lacoste, Alexandre",
booktitle = "Proceedings of the 1st Workshop for Research on Agent Language Models (REALM 2025)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/mtsummit-25-ingestion/2025.realm-1.13/",
doi = "10.18653/v1/2025.realm-1.13",
pages = "170--183",
ISBN = "979-8-89176-264-0",
abstract = "Large Language Models (LLMs) are increasingly deployed in critical domains, but their vulnerability to jailbreak attacks remains a significant concern. In this paper, we propose a multi-agent, multi-turn jailbreak strategy that systematically bypasses LLM safety mechanisms by decomposing harmful queries into seemingly benign sub-tasks. Built upon a role-based agentic framework consisting of a Question Decomposer, a Sub-Question Answerer, and an Answer Combiner, we demonstrate how LLMs can be manipulated to generate prohibited content without prompt manipulations. Our results show a drastic increase in attack success, often exceeding 90{\%} across various LLMs, including GPT-3.5-Turbo, Gemma-2-9B, and Mistral-7B. We further analyze attack consistency across multiple runs and vulnerability across content categories. Compared to existing widely used jailbreak techniques, our multi-agent method consistently achieves the highest attack success rate across all evaluated models. These findings reveal a critical flaw in the current safety architecture of multi-agent LLM systems: their lack of holistic context awareness. By revealing this weakness, we argue for an urgent need to develop multi-turn, context-aware, and robust defenses to address this emerging threat vector."
}
Markdown (Informal)
[Safe in Isolation, Dangerous Together: Agent-Driven Multi-Turn Decomposition Jailbreaks on LLMs](https://preview.aclanthology.org/mtsummit-25-ingestion/2025.realm-1.13/) (Srivastav & Zhang, REALM 2025)
ACL