@inproceedings{peng-etal-2025-muzo,
title = "{MUZO}: Leveraging Multiple Queries and Momentum for Zeroth-Order Fine-Tuning of Large Language Models",
author = "Peng, Yuezhang and
Liu, Yuxin and
Wen, Fei and
Chen, Xie",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-main.432/",
pages = "8577--8595",
ISBN = "979-8-89176-332-6",
abstract = "Fine-tuning pre-trained large language models (LLMs) on downstream tasks has achieved significant success across various domains. However, as model sizes grow, traditional first-order fine-tuning algorithms incur substantial memory overhead due to the need for activation storage for back-propagation (BP). The BP-free Memory-Efficient Zeroth-Order Optimization (MeZO) method estimates gradients through finite differences, avoiding the storage of activation values, and has been demonstrated as a viable approach for fine-tuning large language models. This work proposes the Multiple-query Memory Efficient Zeroth-Order (MUZO) method, which is based on variance-reduced multiple queries to obtain the average of gradient estimates. When combined with Adam optimizer, MUZO-Adam demonstrates superior performance in fine-tuning various LLMs. Furthermore, we provide theoretical guarantees for the convergence of the MUZO-Adam optimizer. Extensive experiments empirically demonstrate that MUZO-Adam converges better than MeZO-SGD and achieves near first-order optimizer performance on downstream classification, multiple-choice, and generation tasks."
}
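For readers skimming the abstract, a minimal sketch of the idea it describes (finite-difference zeroth-order gradient estimation averaged over multiple queries, fed into a standard optimizer) is given below. This is not the authors' implementation; the function name `zo_gradient_estimate` and the parameters `n_queries` and `eps` are illustrative assumptions.

```python
# Minimal sketch of multi-query zeroth-order gradient estimation in the spirit
# of MeZO/MUZO (not the paper's code): perturb the parameters along random
# directions, form a two-point finite difference from forward passes only,
# and average the resulting gradient estimates over several queries.
import torch

def zo_gradient_estimate(params, loss_fn, n_queries=4, eps=1e-3):
    """Average finite-difference gradient estimates over n_queries random directions."""
    grads = [torch.zeros_like(p) for p in params]
    for _ in range(n_queries):
        # One random direction per parameter tensor.
        zs = [torch.randn_like(p) for p in params]
        with torch.no_grad():
            # Loss at theta + eps * z
            for p, z in zip(params, zs):
                p.add_(eps * z)
            loss_plus = loss_fn()
            # Loss at theta - eps * z (step back by 2 * eps * z)
            for p, z in zip(params, zs):
                p.sub_(2 * eps * z)
            loss_minus = loss_fn()
            # Restore the original parameters.
            for p, z in zip(params, zs):
                p.add_(eps * z)
        # Projected gradient estimate: ((L+ - L-) / (2 * eps)) * z
        coeff = (loss_plus - loss_minus) / (2 * eps)
        for g, z in zip(grads, zs):
            g.add_(coeff * z)
    # Average over queries; no back-propagation or activation storage is needed.
    return [g / n_queries for g in grads]
```

The averaged estimate can then be written into `p.grad` and passed to any first-order optimizer such as `torch.optim.Adam`, mirroring the MUZO-Adam combination the abstract describes.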