% ACL Anthology record for the ALRPHFS paper (Findings of EMNLP 2025).
% The url field uses the canonical Anthology address derived from the paper ID
% in the DOI, not a temporary preview-branch address.
@inproceedings{xiang-etal-2025-alrphfs,
  title     = {{ALRPHFS}: Adversarially Learned Risk Patterns with Hierarchical Fast {\&} Slow Reasoning for Robust Agent Defense},
  author    = {Xiang, Shiyu and
               Zhang, Tong and
               Chen, Ronghao},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2025},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-emnlp.1066/},
  doi       = {10.18653/v1/2025.findings-emnlp.1066},
  pages     = {19569--19587},
  isbn      = {979-8-89176-335-7},
  abstract  = {LLM Agents are becoming central to intelligent systems. However, their deployment raises serious safety concerns. Existing defenses largely rely on ``Safety Checks'', which struggle to capture the complex semantic risks posed by harmful user inputs or unsafe agent behaviors{---}creating a significant semantic gap between safety checks and real-world risks. To bridge this gap, we propose a novel defense framework, ALRPHFS (Adversarially Learned Risk Patterns with Hierarchical Fast {\&} Slow Reasoning). ALRPHFS consists of two core components: (1) an offline adversarial self-learning loop to iteratively refine a generalizable and balanced library of risk patterns, substantially enhancing robustness without retraining the base LLM, and (2) an online hierarchical fast {\&} slow reasoning engine that balances detection effectiveness with computational efficiency. Experimental results demonstrate that our approach achieves superior overall performance compared to existing baselines, achieving a best{-}in{-}class average accuracy of 80{\%} and exhibiting strong generalizability across agents and tasks.},
}
Markdown (Informal)
[ALRPHFS: Adversarially Learned Risk Patterns with Hierarchical Fast & Slow Reasoning for Robust Agent Defense](https://aclanthology.org/2025.findings-emnlp.1066/) (Xiang et al., Findings 2025)
ACL