% Conference paper record for "Efficient Agent Evaluation via Diversity-Guided
% User Simulation" (ACL 2026 proceedings, industry-track URL slug).
% NOTE(review): the last author is stored with surname "Tavor" and given names
%   "Ateret Anaby". If the surname is actually the compound "Anaby-Tavor" /
%   "Anaby Tavor", the name should be entered as "Anaby-Tavor, Ateret" --
%   confirm against the paper before changing.
% NOTE(review): the url uses a preview/ingest host path, which is presumably
%   temporary -- confirm the final Anthology URL once the volume is published.
@inproceedings{nakash-etal-2026-efficient,
  title     = {Efficient Agent Evaluation via Diversity-Guided User Simulation},
  author    = {Nakash, Itay and
               Kour, George and
               Tavor, Ateret Anaby},
  editor    = {Li, Yunyao and
               Rehm, Georg and
               Tu, Mei},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} ({ACL} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-industry.112/},
  pages     = {1627--1648},
  isbn      = {979-8-89176-394-4},
  abstract  = {Large language models (LLMs) are increasingly deployed as customer-facing agents, yet evaluating their reliability remains challenging due to stochastic, multi-turn interactions. Current evaluation protocols rely on linear Monte Carlo rollouts of full agent-user conversations to estimate success. This approach is computationally inefficient - reprocessing identical conversation prefixes across runs, and often fails to uncover deep failure modes triggered by rare user behaviors. We introduce DIVERT (Diversity-Induced Evaluation via Branching of Trajectories), a snapshot-based, coverage-guided user simulation framework for efficient and systematic exploration of multi-turn agent behavior. DIVERT captures the full agent{--}environment state at critical junctions and resumes execution from these points, reusing shared prefixes to avoid redundant regeneration and reduce token cost. From each junction, it branches with targeted, diverse user responses, enabling directed exploration of alternative interaction paths while preserving task intent. By reallocating computation from redundant restarts to behaviorally salient mid-trajectory states, DIVERT steers evaluation toward under-explored semantic regions and rare interaction failures. Experiments on realistic multi-domain benchmarks show that our method consistently improves failure discovery efficiency and task-level coverage compared to standard linear rollout evaluation, without increasing overall cost.},
}

Markdown (Informal)
[Efficient Agent Evaluation via Diversity-Guided User Simulation](https://preview.aclanthology.org/ingest-acl/2026.acl-industry.112/) (Nakash et al., ACL 2026)
ACL