% Conference paper record: EMNLP 2025 Industry Track proceedings, pages 912--936.
% NOTE(review): the url field points at a preview/ingest host; confirm the canonical link before publishing.
@inproceedings{kulkarni-etal-2025-agent,
  title     = {Agent vs. Agent: Automated Data Generation and Red-Teaming for Custom Agentic Workflows},
  author    = {Kulkarni, Ninad and
               Wu, Xian and
               Varia, Siddharth and
               Bespalov, Dmitriy},
  editor    = {Potdar, Saloni and
               Rojas-Barahona, Lina and
               Montella, Sebastien},
  booktitle = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-industry.62/},
  pages     = {912--936},
  isbn      = {979-8-89176-333-3},
  abstract  = {Large Language Models (LLMs) deployed as autonomous agents with tool access present unique safety challenges that extend beyond standalone model vulnerabilities. Existing red-teaming frameworks like AgentHarm use static prompts and hardcoded toolsets, limiting their applicability to custom production systems. We introduce a dual-component automated red-teaming framework: AgentHarm-Gen generates adversarial tasks and evaluation functions tailored to arbitrary toolsets, while Red-Agent-Reflect employs iterative prompt refinement with self-reflection to develop progressively more effective attacks. Evaluating across 115 harmful tasks (71 generated, 44 from AgentHarm) spanning 8 risk categories, our method achieves substantial improvements: up to 162{\%} increase in attack success rate on o4-mini and 86{\%} success on Gemini 2.5 Pro. Successful attacks systematically decompose adversarial objectives into benign-appearing sub-tasks that circumvent safety alignment, highlighting the need for agent-specific guardrails.},
}

Markdown (Informal)
[Agent vs. Agent: Automated Data Generation and Red-Teaming for Custom Agentic Workflows](https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-industry.62/) (Kulkarni et al., EMNLP 2025)
ACL