@inproceedings{bassani-sanchez-2025-guardrail,
    title = "On Guardrail Models' Robustness to Mutations and Adversarial Attacks",
    author = "Bassani, Elias and
      Sanchez, Ignacio",
    editor = "Christodoulopoulos, Christos and
      Chakraborty, Tanmoy and
      Rose, Carolyn and
      Peng, Violet",
    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
    month = nov,
    year = "2025",
    address = "Suzhou, China",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.findings-emnlp.922/",
    doi = "10.18653/v1/2025.findings-emnlp.922",
    pages = "16995--17006",
    isbn = "979-8-89176-335-7",
    abstract = "The risk of generative AI systems providing unsafe information has raised significant concerns, emphasizing the need for safety guardrails. To mitigate this risk, guardrail models are increasingly used to detect unsafe content in human-AI interactions, complementing the safety alignment of Large Language Models. Despite recent efforts to evaluate those models' effectiveness, their robustness to input mutations and adversarial attacks remains largely unexplored. In this paper, we present a comprehensive evaluation of 15 state-of-the-art guardrail models, assessing their robustness to: a) input mutations, such as typos, keywords camouflage, ciphers, and veiled expressions, and b) adversarial attacks designed to bypass models' safety alignment. Those attacks exploit LLMs capabilities like instruction-following, role-playing, personification, reasoning, and coding, or introduce adversarial tokens to induce model misbehavior. Our results reveal that most guardrail models can be evaded with simple input mutations and are vulnerable to adversarial attacks. For instance, a single adversarial token can deceive them 44.5{\%} of the time on average. The limitations of the current generation of guardrail models highlight the need for more robust safety guardrails."
}
Markdown (Informal)
[On Guardrail Models’ Robustness to Mutations and Adversarial Attacks](https://aclanthology.org/2025.findings-emnlp.922/) (Bassani & Sanchez, Findings 2025)
ACL