@inproceedings{cao-etal-2024-defending,
    title = "Defending Against Alignment-Breaking Attacks via Robustly Aligned {LLM}",
    author = "Cao, Bochuan and
      Cao, Yuanpu and
      Lin, Lu and
      Chen, Jinghui",
    editor = "Ku, Lun-Wei and
      Martins, Andre and
      Srikumar, Vivek",
    booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month = aug,
    year = "2024",
    address = "Bangkok, Thailand",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.acl-long.568/",
    doi = "10.18653/v1/2024.acl-long.568",
    pages = "10542--10560",
    abstract = "Recently, Large Language Models (LLMs) have made significant advancements and are now widely used across various domains. Unfortunately, there has been a rising concern that LLMs can be misused to generate harmful or malicious content. Though a line of research has focused on aligning LLMs with human values and preventing them from producing inappropriate content, such alignments are usually vulnerable and can be bypassed by alignment-breaking attacks via adversarially optimized or handcrafted jailbreaking prompts. In this work, we introduce a Robustly Aligned LLM (RA-LLM) to defend against potential alignment-breaking attacks. RA-LLM can be directly constructed upon an existing aligned LLM with a robust alignment checking function, without requiring any expensive retraining or fine-tuning process of the original LLM. Furthermore, we also provide a theoretical analysis for RA-LLM to verify its effectiveness in defending against alignment-breaking attacks. Through real-world experiments on open-source large language models, we demonstrate that RA-LLM can successfully defend against both state-of-the-art adversarial prompts and popular handcrafted jailbreaking prompts by reducing their attack success rates from nearly 100{\%} to around 10{\%} or less."
}
Markdown (Informal)
[Defending Against Alignment-Breaking Attacks via Robustly Aligned LLM](https://aclanthology.org/2024.acl-long.568/) (Cao et al., ACL 2024)
ACL