% ACL Anthology record for the PROTEA system-demonstration paper (ACL 2026).
% Field values are unchanged from the export; only the delimiters, the
% field-name case and the layout have been normalised.
% NOTE(review): the url below points at a preview/ingest host. Presumably it
% should be swapped for the permanent Anthology URL (and a doi added) once the
% volume is published -- confirm before citing.
@inproceedings{kawamura-etal-2026-protea,
  title     = {{PROTEA}: Offline Evaluation and Iterative Refinement for Multi-Agent {LLM} Workflows},
  author    = {Kawamura, Kazuki and
               Waki, Satoshi and
               Tateno, Kei},
  editor    = {Durrett, Greg and
               Jian, Ping},
  booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 3: System Demonstrations)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-demo.3/},
  pages     = {27--35},
  isbn      = {979-8-89176-392-0},
  abstract  = {Multi-agent LLM workflows, which are AI systems composed of multiple role-specialized LLM calls, often outperform single prompts, but they are notoriously difficult to debug and refine. Failures can originate from subtle mistakes in intermediate artifacts that silently propagate downstream, forcing developers to read long traces and guess which agent to edit. We present PROTEA, a unified UI that closes the loop for offline, test-case{--}driven improvement of multi-agent workflows, enabling developers to efficiently diagnose and fix errors without manual inspection of long traces. PROTEA executes a workflow, scores intermediate artifacts with configurable evaluators, and overlays per-node states and rationales on the workflow graph to localize likely bottlenecks. To address the difficulty of preparing intermediate reference in complex systems, PROTEA performs backward node evaluation by inferring each node{'}s ideal expected output from terminal supervision and graph context, and comparing it with the observed node output. For a selected node, it proposes a targeted prompt patch as an editable diff, then automatically re-runs and re-evaluates the workflow to show before/after output diffs and score trajectories within the same interface. Using PROTEA, users can visually pinpoint system-wide bottlenecks at a glance, streamline remediation via semi-automated prompt patching, and immediately verify pre- and post-correction outcomes within a unified loop.},
}
Markdown (Informal)
[PROTEA: Offline Evaluation and Iterative Refinement for Multi-Agent LLM Workflows](https://preview.aclanthology.org/ingest-acl/2026.acl-demo.3/) (Kawamura et al., ACL 2026)
ACL