@inproceedings{liu-etal-2024-evaluation-mechanism,
    title     = "An Evaluation Mechanism of {LLM}-based Agents on Manipulating {API}s",
    author    = "Liu, Bing and
      Zhou, Jianxiang and
      Meng, Dan and
      Lu, Haonan",
    editor    = "Al-Onaizan, Yaser and
      Bansal, Mohit and
      Chen, Yun-Nung",
    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
    month     = nov,
    year      = "2024",
    address   = "Miami, Florida, USA",
    publisher = "Association for Computational Linguistics",
    url       = "https://aclanthology.org/2024.findings-emnlp.267/",
    doi       = "10.18653/v1/2024.findings-emnlp.267",
    pages     = "4649--4662",
    abstract  = "LLM-based agents can greatly extend the abilities of LLMs and thus attract sharply increased studies. An ambitious vision {--} serving users by manipulating massive API-based tools {--} has been proposed and explored. However, we find a widely accepted evaluation mechanism for generic agents is still missing. This work aims to fill this gap. We decompose tool use capability into seven aspects and form a thorough evaluation schema. In addition, we design and release an instruction dataset and a toolset {--} the two sides that the agents bridge between {--} following the principle of reflecting real-world challenges. Furthermore, we evaluate multiple generic agents. Our findings can inspire future research in improving LLM-based agents and rethink the philosophy of API design."
}
Markdown (Informal)
[An Evaluation Mechanism of LLM-based Agents on Manipulating APIs](https://aclanthology.org/2024.findings-emnlp.267/) (Liu et al., Findings 2024)
ACL