@inproceedings{kim-cho-2025-break,
title = "Break the Breakout: Reinventing {LM} Defense Against Jailbreak Attacks with Self-Refine",
author = "Kim, Heegyu and
Cho, Hyunsouk",
editor = "Cao, Trista and
Das, Anubrata and
Kumarage, Tharindu and
Wan, Yixin and
Krishna, Satyapriya and
Mehrabi, Ninareh and
Dhamala, Jwala and
Ramakrishna, Anil and
Galstyan, Aram and
Kumar, Anoop and
Gupta, Rahul and
Chang, Kai-Wei",
booktitle = "Proceedings of the 5th Workshop on Trustworthy NLP (TrustNLP 2025)",
month = may,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2025.trustnlp-main.7/",
pages = "82--102",
ISBN = "979-8-89176-233-6",
abstract = "Language models (LMs) are vulnerable to exploitation for adversarial misuse. Training LMs for safety alignment is extensive, making it hard to respond to fast-developing attacks immediately, such as jailbreaks. We propose self-refine with formatting that achieves outstanding safety even in non-safety-aligned LMsand evaluate our method alongside several defense baselines, demonstrating that it is the safest training-free method against jailbreak attacks.Additionally, we proposed a formatting method that improves the efficiency of the self-refine process while reducing attack success rates in fewer iterations. We observed that non-safety-aligned LMs outperform safety-aligned LMs in safety tasks by giving more helpful and safe responses.In conclusion, our findings can achieve less safety risk with fewer computational costs, allowing non-safety LM to be efficiently utilized in real-world service."
}