% Conference-paper record for Simhadri (2026), in ACL Anthology export layout.
% Case-sensitive words in title/booktitle are protected with whole-word braces
% so sentence-casing styles keep them intact.
% NOTE(review): the url field points at a preview/ingest host; presumably it
% should be swapped for the canonical Anthology URL once published -- confirm.
% NOTE(review): address holds the meeting location, as in the original export;
% verify this matches the convention of the target bibliography style.
@inproceedings{simhadri-2026-moves,
  title     = {What Moves the {Pareto} Frontier in Tool-Using Agents? A Compute-Aware Study of {ReAct} Variants},
  author    = {Simhadri, Rishi N.},
  editor    = {T.Y.S.S., Santosh and
               Rodriguez, Juan Diego and
               de Gibert, Ona},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} ({ACL} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-srw.82/},
  pages     = {923--940},
  isbn      = {979-8-89176-393-7},
  abstract  = {Tool-using LLM agents are typically compared by accuracy alone, despite deployments being constrained by inference cost. We present a budgeted evaluation of common strategies for improving ReAct-style tool agents (multi-sample aggregation, iterative self-correction, and post-hoc answer revision) using Pareto analysis of cumulative accuracy versus token budget on three benchmarks (HotPotQA, FEVER, GSM8K) with Gemini 2.5 Flash. All experiments use three random seeds (N=500 per seed for HotPotQA/FEVER; N=1,015 for GSM8K); budgeted curves are computed post hoc from per-instance token logs. In our offline evaluation, Reflexion attains the highest accuracy on tool-heavy benchmarks (HotPotQA, FEVER), while CoT-SC leads on GSM8K. Reflexion{'}s reported token costs are optimistic lower bounds because retries are stopped using ground-truth feedback, and its accuracy is similarly optimistic: a deployment without access to ground-truth labels would not achieve the same accuracy because the gold-label stopping criterion would be unavailable; both costs and accuracy would differ in practice. Sampling-based approaches often spend 3-5x more tokens for comparatively small gains on tool-heavy tasks. GSM8K, a pure-math benchmark with minimal tool interaction, shows substantially larger gains for CoT-SC, TCAR, and Reflexion, larger than on tool-heavy benchmarks, though less sharply separated than headline accuracy alone would suggest, consistent with repeated tool trajectories being an important contributor to the observed efficiency gap in our tool-heavy settings. We provide a compute-aware evaluation protocol (frontier analysis and marginal-cost metrics) and practical guidance for choosing agent designs under different budget regimes.},
}
Markdown (Informal)
[What Moves the Pareto Frontier in Tool-Using Agents? A Compute-Aware Study of ReAct Variants](https://preview.aclanthology.org/ingest-acl/2026.acl-srw.82/) (Simhadri, ACL 2026)
ACL