@comment{
  ToolPRMBench paper, Findings of ACL 2026.
  NOTE(review): the url field points at a preview.aclanthology.org ingest host;
  this looks like a staging address. Confirm the canonical URL before citing.
}
@inproceedings{li-etal-2026-toolprmbench,
  title     = {{ToolPRMBench}: Evaluating and Advancing Process Reward Models for Tool-using Agents},
  author    = {Li, Dawei and
               Yao, Yuguang and
               Tan, Zhen and
               Liu, Huan and
               Guo, Ruocheng},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.602/},
  pages     = {12378--12391},
  isbn      = {979-8-89176-395-1},
  abstract  = {Reward-guided search methods have demonstrated strong potential in enhancing tool-using agents by effectively guiding sampling and exploration over complex action spaces. As a core design, those search methods utilize process reward models (PRMs) to provide step-level rewards, enabling more fine-grained monitoring. However, there is a lack of systematic and reliable evaluation benchmarks for PRMs in tool-use settings. In this paper, we introduce ToolPRMBench, a large-scale benchmark specifically designed to evaluate PRMs for tool-using agents. ToolPRMBench is built on top of several representative tool-use benchmarks and converts agent trajectories into step-level test cases. Each case contains the interaction history, a correct action, a plausible but incorrect alternative, and relevant tool metadata. We respectively utilize offline sampling to isolate local single-step errors and online sampling to capture realistic multi-step failures from full agent rollouts. A multi-LLM verification pipeline is proposed to reduce label noise and ensure data quality. We conduct extensive experiments across large language models, general PRMs, and tool-specialized PRMs on ToolPRMBench. The results reveal clear differences in PRM effectiveness and highlight the potential of specialized PRMs for tool-using. Our code and dataset are available at: \url{https://github.com/David-Li0406/ToolPRMBench}[More resources on LLM-as-a-judge are on the website: {\ensuremath{<}}https://llm-as-a-judge.github.io{\ensuremath{>}}].},
}
Markdown (Informal)
[ToolPRMBench: Evaluating and Advancing Process Reward Models for Tool-using Agents](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.602/) (Li et al., Findings 2026)
ACL