% Zhao and Zhu (2025), "SkyLLM" -- paper in Findings of ACL 2025.
% url uses the stable ACL Anthology permalink form rather than a preview-branch host.
% Case-sensitive words in the title are brace-protected as whole words.
@inproceedings{zhao-zhu-2025-skyllm,
  title     = {{SkyLLM}: Cross-{LLM}-{APIs} Federation for Cost-effective Query Processing},
  author    = {Zhao, Heng and
               Zhu, Yifei},
  editor    = {Che, Wanxiang and
               Nabende, Joyce and
               Shutova, Ekaterina and
               Pilehvar, Mohammad Taher},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2025},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-acl.1073/},
  pages     = {20864--20873},
  isbn      = {979-8-89176-256-5},
  abstract  = {Large language models (LLMs) have demonstrated exceptional capabilities across a wide range of tasks, from text generation to complex problem-solving. LLM APIs provide easy access to these models by streamlining deployment and usage. Combining LLMs with complementary strengths has been shown to yield substantial performance gains over a monolithic LLM. However, invoking a fixed set of LLM APIs for each query incurs higher API costs and increased inference latency. To address these limitations, we propose SkyLLM, a system composed of a set of estimators and an API selector, which federates multiple LLM APIs and dynamically assigns a non-empty subset of these APIs to each query prior to inference under cost and latency budgets. The selected subset consists of either a single LLM or multiple LLMs. A single LLM efficiently handles simple queries at low cost, whereas multiple LLMs are employed for more complex queries to overcome performance limitations. We evaluate SkyLLM against individual LLMs and representative ensemble LLM methods from the literature. SkyLLM achieves the highest accuracy under a high budget. It can also be cost-effective, matching the most accurate individual LLM while cutting costs by 67.8{\%}.},
}
Markdown (Informal)
[SkyLLM: Cross-LLM-APIs Federation for Cost-effective Query Processing](https://aclanthology.org/2025.findings-acl.1073/) (Zhao & Zhu, Findings 2025)
ACL