@inproceedings{kim-etal-2024-robust,
title = "Robust Safety Classifier Against Jailbreaking Attacks: Adversarial Prompt Shield",
author = "Kim, Jinhwa and
Derakhshan, Ali and
Harris, Ian",
editor = {Chung, Yi-Ling and
Talat, Zeerak and
Nozza, Debora and
Plaza-del-Arco, Flor Miriam and
R{\"o}ttger, Paul and
Mostafazadeh Davani, Aida and
Calabrese, Agostina},
booktitle = "Proceedings of the 8th Workshop on Online Abuse and Harms (WOAH 2024)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2024.woah-1.12/",
doi = "10.18653/v1/2024.woah-1.12",
pages = "159--170",
    abstract = "Large Language Models' safety remains a critical concern due to their vulnerability to jailbreaking attacks, which can prompt these systems to produce harmful and malicious responses. Safety classifiers, computational models trained to discern and mitigate potentially harmful, offensive, or unethical outputs, offer a practical solution to address this issue. However, despite their potential, existing safety classifiers often fail when exposed to adversarial attacks such as gradient-optimized suffix attacks. In response, our study introduces Adversarial Prompt Shield (APS), a lightweight safety classifier model that excels in detection accuracy and demonstrates resilience against unseen jailbreaking prompts. We also introduce efficiently generated adversarial training datasets, named Bot Adversarial Noisy Dialogue (BAND), which are designed to fortify the classifier's robustness. Through extensive testing on various safety tasks and unseen jailbreaking attacks, we demonstrate the effectiveness and resilience of our models. Evaluations show that our classifier has the potential to significantly reduce the Attack Success Rate by up to 44.9{\%}. This advance paves the way for the next generation of more reliable and resilient Large Language Models."
}