@inproceedings{fu-etal-2026-flare,
  title     = {{FLARE}: Fine-Grained Length-Aware Routing for Resource-Efficient Heterogeneous {LLM} Serving},
  author    = {Fu, Yujia and
               Zhong, Heming and
               Huang, Dan and
               Lu, Yutong},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.1018/},
  pages     = {22249--22266},
  isbn      = {979-8-89176-390-6},
  abstract  = {With the rapid proliferation of large language models (LLMs), model pools have become increasingly heterogeneous in both capability and efficiency. Larger LLMs can improve quality but incur higher latency and cost, while smaller LLMs are the opposite, making per-query model selection crucial in practice. This has spawned LLM routers that dispatch each query to an appropriate model. Existing routers lack fine-grained resource awareness across deployment settings, which degrades efficiency metrics in real-world serving. To this end, we propose FLARE, a length-centric, resource-aware multi-LLM routing framework that uses length-based models to estimate per-query latency and cost. FLARE formulates routing as a discrete multi-objective optimization problem to achieve efficient trade-off. Experiments show that FLARE reduces latency and cost by up to 68{\%} and 75{\%} while maintaining competitive accuracy, and can be easily applied to new datasets and LLMs.},
}

Markdown (Informal)
[FLARE: Fine-Grained Length-Aware Routing for Resource-Efficient Heterogeneous LLM Serving](https://preview.aclanthology.org/ingest-acl/2026.acl-long.1018/) (Fu et al., ACL 2026)
ACL