@inproceedings{li-etal-2025-efficient-ensemble,
  title     = {Efficient Ensemble for Fine-tuning Language Models on Multiple Datasets},
  author    = {Li, Dongyue and
               Zhang, Ziniu and
               Wang, Lu and
               Zhang, Hongyang R.},
  editor    = {Che, Wanxiang and
               Nabende, Joyce and
               Shutova, Ekaterina and
               Pilehvar, Mohammad Taher},
  booktitle = {Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.acl-long.1231/},
  pages     = {25347--25364},
  isbn      = {979-8-89176-251-0},
  abstract  = {This paper develops an ensemble method for fine-tuning a language model to multiple datasets. Existing methods, such as quantized LoRA (QLoRA), are efficient when adapting to a single dataset. When training on multiple datasets of different tasks, a common setup in practice, it remains unclear how to design an efficient adaptation for fine-tuning language models. We propose to use an ensemble of multiple smaller adapters instead of a single adapter per task. We design an efficient algorithm that partitions $n$ datasets into $m$ groups, where $m$ is typically much smaller than $n$ in practice, and train one adapter for each group before taking a weighted combination to form the ensemble. The algorithm leverages a first-order approximation property of low-rank adaptation to quickly obtain the fine-tuning performances of dataset combinations since methods like LoRA stay close to the base model. Hence, we use the gradients of the base model to estimate its behavior during fine-tuning. Empirically, this approximation holds with less than 1{\%} error on models with up to 34 billion parameters, leading to an estimation of true fine-tuning performances under 5{\%} error while speeding up computation compared to base fine-tuning by 105 times. When applied to fine-tune Llama and GPT models on ten text classification tasks, our approach provides up to 10{\%} higher average test accuracy over QLoRA, with only 9{\%} more FLOPs. On a Llama model with 34 billion parameters, an ensemble of QLoRA increases test accuracy by 3{\%} compared to QLoRA, with only 8{\%} more FLOPs.},
}
Markdown (Informal)
[Efficient Ensemble for Fine-tuning Language Models on Multiple Datasets](https://aclanthology.org/2025.acl-long.1231/) (Li et al., ACL 2025)
ACL