% Conference paper in the Findings of ACL 2026 volume (ACL Anthology export).
% NOTE(review): url points at the Anthology preview/ingest host -- confirm
%   whether a canonical URL or a DOI should replace it before citing.
% NOTE(review): address presumably records the conference venue rather than
%   the publisher's city -- confirm against the target bibliography style.
@inproceedings{lee-kim-2026-sage,
  title     = {{SAGE}: Sign-Adaptive Gradient for Memory-Efficient {LLM} Optimization},
  author    = {Lee, Wooin and
               Kim, Hyun-Tae},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.923/},
  pages     = {18525--18537},
  isbn      = {979-8-89176-395-1},
  abstract  = {The AdamW optimizer, while standard for LLM pretraining, is a critical memory bottleneck, consuming optimizer states equivalent to twice the model{'}s size. Although light-state optimizers like SinkGD attempt to address this issue, we identify the embedding layer dilemma: these methods fail to handle the sparse, high-variance gradients inherent to embeddings, forcing a hybrid design that reverts to AdamW and partially negates the memory gains. We propose SAGE (Sign Adaptive GradiEnt), a novel optimizer that resolves this dilemma by replacing AdamW in this hybrid structure. SAGE combines a Lion-style update direction with a new, memory-efficient $O(d)$ adaptive scale. This scale acts as a ``safe damper,'' provably bounded by 1.0, which tames high-variance dimensions more effectively than existing methods. This superior stability allows SAGE to achieve better convergence. On Llama models up to 1.3B parameters, our SAGE-based hybrid achieves new state-of-the-art perplexity, outperforming all baselines, including SinkGD hybrid, while significantly reducing optimizer state memory.},
}
Markdown (Informal)
[SAGE: Sign-Adaptive Gradient for Memory-Efficient LLM Optimization](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.923/) (Lee & Kim, Findings 2026)
ACL