@article{xu-etal-2025-crvq,
    title     = {{CRVQ}: Channel-Relaxed Vector Quantization for Extreme Compression of {LLM}s},
    author    = {Xu, Yuzhuang and
                 Ji, Shiyu and
                 Zhu, Qingfu and
                 Che, Wanxiang},
    journal   = {Transactions of the Association for Computational Linguistics},
    volume    = {13},
    year      = {2025},
    address   = {Cambridge, MA},
    publisher = {MIT Press},
    url       = {https://aclanthology.org/2025.tacl-1.68/},
    doi       = {10.1162/tacl.a.45},
    pages     = {1488--1506},
    abstract  = {Powerful large language models (LLMs) are increasingly expected to be deployed with lower computational costs, enabling their capabilities on resource-constrained devices. Post-training quantization (PTQ) has emerged as a star approach to achieve this ambition, with best methods compressing weights to less than 2 bit on average. In this paper, we propose Channel-Relaxed Vector Quantization (CRVQ), a novel technique that significantly improves the performance of PTQ baselines at the cost of only minimal additional bits. This state-of-the-art extreme compression method achieves its results through two key innovations: (1) carefully selecting and reordering a very small subset of critical weight channels, and (2) leveraging extended codebooks to relax the constraint of critical channels. With our method, we demonstrate a 38.9{\%} improvement over the current strongest sub-2-bit PTQ baseline, enabling nearer lossless 1-bit compression. Furthermore, our approach offers flexible customization of quantization bit-width and performance, providing a wider range of deployment options for diverse hardware platforms. Code and checkpoints are available at https://github.com/xuyuzhuang11/CRVQ.}
}
Markdown (Informal)
[CRVQ: Channel-Relaxed Vector Quantization for Extreme Compression of LLMs](https://aclanthology.org/2025.tacl-1.68/) (Xu et al., TACL 2025)
ACL