@inproceedings{maeng-etal-2025-goal,
title = "Goal-Conditioned {DPO}: Prioritizing Safety in Misaligned Instructions",
author = "Maeng, Joo Bon and
Lee, Seongmin and
Seo, Seokin and
Kim, Kee-Eung",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/landing_page/2025.naacl-long.369/",
pages = "7196--7211",
ISBN = "979-8-89176-189-6",
abstract = "Large language models (LLMs) undergo extensive safety training to maximize both helpfulness and harmlessness in their responses. However, various jailbreak attacks jeopardize model safety, allowing malicious actors to bypass safety guidelines. Existing defense methods primarily focus on aligning the model{'}s output towards less harmful responses through post-processing or input perturbation. Consequently, these approaches are prone to general performance degradation and lack the ability to defend against a wide variety of attacks. In this paper, we propose goal-conditioned direct preference optimization (GC-DPO), which is trained to prioritize the system prompt over the user prompt through goal-conditioning, and thus enables a good balance between safety and performance. Empirically, we show that our approach significantly reduces the average Attack Success Rate (ASR) on a wide variety of jailbreak attacks. In particular, GC-DPO achieves a reduction of 67.1{\%} to 5.0{\%} in ASR for Vicuna-7B, a state-of-the-art result, without compromising the model{'}s general performance."
}
Markdown (Informal)
[Goal-Conditioned DPO: Prioritizing Safety in Misaligned Instructions](https://preview.aclanthology.org/landing_page/2025.naacl-long.369/) (Maeng et al., NAACL 2025)
ACL
Joo Bon Maeng, Seongmin Lee, Seokin Seo, and Kee-Eung Kim. 2025. Goal-Conditioned DPO: Prioritizing Safety in Misaligned Instructions. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 7196–7211, Albuquerque, New Mexico. Association for Computational Linguistics.
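
The abstract describes GC-DPO as direct preference optimization conditioned on a goal so that the system prompt takes priority over the user prompt. The paper's exact conditioning scheme is not reproduced here; the sketch below only shows the standard DPO preference loss on sequence log-probabilities, with the assumption that those log-probabilities were computed with a goal (derived from the system prompt) prepended to the context. Function and variable names are illustrative, not taken from the paper.

```python
import torch
import torch.nn.functional as F

def dpo_loss(policy_chosen_logps, policy_rejected_logps,
             ref_chosen_logps, ref_rejected_logps, beta=0.1):
    """Standard DPO loss over a batch of preference pairs.

    Assumption (not from the paper): each log-probability is the summed
    token log-prob of a response given a context that already includes
    the goal / system prompt, which is how "goal-conditioning" is
    approximated in this sketch.
    """
    # Implicit rewards: scaled log-ratio of policy vs. frozen reference model.
    chosen_rewards = beta * (policy_chosen_logps - ref_chosen_logps)
    rejected_rewards = beta * (policy_rejected_logps - ref_rejected_logps)
    # Maximize the margin between the preferred (safe) and dispreferred responses.
    return -F.logsigmoid(chosen_rewards - rejected_rewards).mean()

# Toy usage with random log-probabilities for a batch of 4 preference pairs.
torch.manual_seed(0)
pc, pr = torch.randn(4), torch.randn(4)
rc, rr = torch.randn(4), torch.randn(4)
print(dpo_loss(pc, pr, rc, rr))
```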