@comment{
  SIRAJ paper in the Findings of EACL 2026 volume
  (Anthology ID 2026.findings-eacl.171, taken from the exported URL path).
  Review notes:
  - url: uses the permanent aclanthology.org address built from the Anthology ID;
    the export pointed at the preview.aclanthology.org/ingest-eacl staging host.
  - booktitle: case protection is applied to whole words, not single letters.
  - NOTE(review): the third editor's surname may need a grave accent on the "a";
    confirm against the proceedings front matter before changing it.
  - address: holds the conference venue as exported (ACL convention), left as is.
  - abstract: kept verbatim from the export, including its LaTeX markup.
}
@inproceedings{zhou-etal-2026-siraj,
  title     = {{SIRAJ}: Diverse and Efficient Red-Teaming for {LLM} Agents via Distilled Structured Reasoning},
  author    = {Zhou, Kaiwen and
               Elgohary, Ahmed and
               Iftekhar, A S M and
               Saied, Amin},
  editor    = {Demberg, Vera and
               Inui, Kentaro and
               Marquez, Llu{\'\i}s},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {EACL} 2026},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.findings-eacl.171/},
  pages     = {3269--3292},
  isbn      = {979-8-89176-386-9},
  abstract  = {The ability of LLM agents to plan and invoke tools exposes them to new safety risks, making a comprehensive red-teaming system crucial for discovering vulnerabilities and ensuring their safe deployment. We present SIRAJ, a generic red-teaming framework for arbitrary \textit{black-box} LLM agents. We employ a dynamic two-step process that starts with an agent definition and generates diverse seed test cases that cover diverse risk outcomes, tool-use trajectories, and risk sources. Then, it iteratively constructs and refines model-based adversarial attacks based on the execution trajectories of former attempts. To optimize the red-teaming cost, we present a model distillation approach that leverages structured forms of a teacher model{'}s reasoning to train smaller models that are equally effective. Across diverse evaluation agent settings, our seed test case generation approach yields 2 {--} 2.5x boost to the coverage of risk outcomes and tool-calling trajectories. Our distilled 8B red-teamer model improves attack success rate by 100{\%}, surpassing the 671B Deepseek-R1 model. Our ablations and analyses validate the effectiveness of the iterative framework, structured reasoning, and the generalization of our red-teamer models.},
}
Markdown (Informal)
[SIRAJ: Diverse and Efficient Red-Teaming for LLM Agents via Distilled Structured Reasoning](https://aclanthology.org/2026.findings-eacl.171/) (Zhou et al., Findings 2026)
ACL