% ACL Anthology record for the ETOM paper (Findings of the ACL: EACL 2026).
% NOTE(review): the url field points at the Anthology preview/ingest host;
% confirm the canonical aclanthology.org address before relying on it.
@inproceedings{dong-etal-2026-etom,
  title     = {{ETOM}: A Five-Level Benchmark for Evaluating Tool Orchestration within the {MCP} Ecosystem},
  author    = {Dong, Jia-Kai and
               Huang, I-Wei and
               Wu, Chun-Tin and
               Tsai, Yi-tien},
  editor    = {Demberg, Vera and
               Inui, Kentaro and
               Marquez, Llu{\'i}s},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {EACL} 2026},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-eacl/2026.findings-eacl.75/},
  pages     = {1453--1488},
  isbn      = {979-8-89176-386-9},
  abstract  = {We introduce ETOM, a five-level benchmark for evaluating multi-hop, end-to-end tool orchestration by LLM agents within a hierarchical Model-Context Protocol (MCP) ecosystem. Existing benchmarks often assess tools in isolation, overlooking challenges such as functional overlap and cross-server orchestration, which can lead to overly optimistic evaluations. ETOM addresses these gaps by constructing ground truth through ``equal function sets'', enabling objective metrics such as F1 score and reducing reliance on LLM-as-a-judge evaluation. Its five-level curriculum systematically tests agent capabilities, from single-tool orchestration to complex cross-server planning, as well as robustness to out-of-scope requests. Experiments reveal that rigid hierarchies can hinder performance without co-designed strategies, and even state-of-the-art agents exhibit systemic weaknesses in robustness. ETOM provides a diagnostic framework to expose these limitations and guide the development of more capable and efficient tool-using agents.},
}
Markdown (Informal)
[ETOM: A Five-Level Benchmark for Evaluating Tool Orchestration within the MCP Ecosystem](https://preview.aclanthology.org/ingest-eacl/2026.findings-eacl.75/) (Dong et al., Findings 2026)
ACL