@inproceedings{liu-etal-2024-beyond-text,
title = "Beyond Text: Unveiling Multimodal Proficiency of Large Language Models with {M}ulti{API} Benchmark",
author = "Liu, Xiao and
Lin, Jianfeng and
Zhang, Jiawei",
editor = "Li, Sha and
Li, Manling and
Zhang, Michael JQ and
Choi, Eunsol and
Geva, Mor and
Hase, Peter and
Ji, Heng",
booktitle = "Proceedings of the 1st Workshop on Towards Knowledgeable Language Models (KnowLLM 2024)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2024.knowllm-1.4/",
doi = "10.18653/v1/2024.knowllm-1.4",
pages = "32--44",
    abstract = "The proliferation of Large Language Models like ChatGPT has significantly advanced language understanding and generation, impacting a broad spectrum of applications. However, these models predominantly excel in text-based tasks, overlooking the complexity of real-world multimodal information. This study introduces \textbf{MultiAPI}, a pioneering comprehensive large-scale API benchmark dataset aimed at expanding LLMs' proficiency in multimodal contexts. Developed collaboratively through ChatGPT, \textbf{MultiAPI} consists of 187 diverse API calls and 1,799 contextual prompts, offering a unique platform for evaluating how tool-augmented LLMs handle multimodal tasks. Through comprehensive experiments, our findings reveal that while LLMs demonstrate proficiency in API call decision-making, they face challenges in domain identification, function selection, and argument generation. What{'}s more, we surprisingly notice that auxiliary context can actually impair performance. An in-depth error analysis paves the way for a new paradigm to address these challenges, suggesting a potential direction for future LLM research."
}