@inproceedings{huang-wu-2025-quaff,
  title     = {{Quaff}: Quantized Parameter-Efficient Fine-Tuning under Outlier Spatial Stability Hypothesis},
  author    = {Huang, Hong and
               Wu, Dapeng},
  editor    = {Che, Wanxiang and
               Nabende, Joyce and
               Shutova, Ekaterina and
               Pilehvar, Mohammad Taher},
  booktitle = {Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.acl-long.325/},
  pages     = {6481--6496},
  isbn      = {979-8-89176-251-0},
  abstract  = {Large language models (LLMs) have made exciting achievements across various domains, yet their deployment on resource-constrained personal devices remains hindered by the prohibitive computational and memory demands of task-specific fine-tuning. While quantization offers a pathway to efficiency, existing methods struggle to balance performance and overhead, either incurring high computational/memory costs or failing to address activation outliers{---}a critical bottleneck in quantized fine-tuning. To address these challenges, we propose the Outlier Spatial Stability Hypothesis ({\_}{\_}OSSH{\_}{\_}): {\_}During fine-tuning, certain activation outlier channels retain stable spatial positions across training iterations.{\_} Building on OSSH, we propose {\_}{\_}Quaff{\_}{\_}, a Quantized parameter-efficient fine-tuning framework for LLMs, optimizing low-precision activation representations through targeted momentum scaling. Quaff dynamically suppresses outliers exclusively in invariant channels using lightweight operations, eliminating full-precision weight storage and global rescaling while reducing quantization errors. Extensive experiments across ten benchmarks validate OSSH and demonstrate Quaff{'}s efficacy. Specifically, on the GPQA reasoning benchmark, Quaff achieves a $1.73\times$ latency reduction and 30{\%} memory savings over full-precision fine-tuning while improving accuracy by 0.6{\%} on the Phi-3 model, reconciling the triple trade-off between efficiency, performance, and deployability. By enabling consumer-grade GPU fine-tuning (e.g., RTX 2080 Super) without sacrificing model utility, Quaff democratizes personalized LLM deployment. The code is available at https://anonymous.4open.science/r/Quaff-B322/.},
}
Markdown (Informal)
[Quaff: Quantized Parameter-Efficient Fine-Tuning under Outlier Spatial Stability Hypothesis](https://aclanthology.org/2025.acl-long.325/) (Huang & Wu, ACL 2025)
ACL