@inproceedings{ramesh-etal-2024-gpt,
title = "{GPT}-4 Jailbreaks Itself with Near-Perfect Success Using Self-Explanation",
author = "Ramesh, Govind and
Dou, Yao and
Xu, Wei",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2024.emnlp-main.1235/",
doi = "10.18653/v1/2024.emnlp-main.1235",
pages = "22139--22148",
abstract = "Research on jailbreaking has been valuable for testing and understanding the safety and security issues of large language models (LLMs). In this paper, we introduce Iterative Refinement Induced Self-Jailbreak (IRIS), a novel approach that leverages the reflective capabilities of LLMs for jailbreaking with only black-box access. Unlike previous methods, IRIS simplifies the jailbreaking process by using a single model as both the attacker and target. This method first iteratively refines adversarial prompts through self-explanation, which is crucial for ensuring that even well-aligned LLMs obey adversarial instructions. IRIS then rates and enhances the output given the refined prompt to increase its harmfulness. We find that IRIS achieves jailbreak success rates of 98{\%} on GPT-4, 92{\%} on GPT-4 Turbo, and 94{\%} on Llama-3.1-70B in under 7 queries. It significantly outperforms prior approaches in automatic, black-box, and interpretable jailbreaking, while requiring substantially fewer queries, thereby establishing a new standard for interpretable jailbreaking methods."
}
[GPT-4 Jailbreaks Itself with Near-Perfect Success Using Self-Explanation](https://aclanthology.org/2024.emnlp-main.1235/) (Ramesh et al., EMNLP 2024)