@inproceedings{zhong-etal-2025-quantized,
title = "Quantized Can Still Be Calibrated: A Unified Framework to Calibration in Quantized Large Language Models",
author = "Zhong, Mingyu and
Wang, Guanchu and
Chuang, Yu-Neng and
Zou, Na",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.1473/",
pages = "30503--30517",
ISBN = "979-8-89176-251-0",
abstract = "Although weight quantization helps large language models (LLMs) in resource-constrained environments, its influence on the uncertainty calibration remains unexplored. To bridge this gap, we presents a comprehensive investigation of uncertainty calibration for quantized LLMs in this work. Specifically, we propose an analytic method to estimate the upper bound of calibration error (UBCE) for LLMs. Our method separately discusses the calibration error of the model{'}s correct and incorrect predictions, indicating a theoretical improvement of calibration error caused by the weight quantization. Our study demonstrates that quantized models consistently exhibit worse calibration performance than full-precision models, supported by consistent analysis across multiple LLMs and datasets. To address the calibration issues of quantized models, we propose a novel method of post calibration for recovering the calibration performance of quantized models through soft-prompt tuning. Specifically, we inject soft tokens to quantized models after the embedding layers, and optimize these tokens to recover the calibration error caused by the weight quantization. Experimental results on multiple datasets demonstrate our effectiveness in improving the uncertainty calibration of quantized LLMs, facilitating more reliable weight quantization in resource-constrained environments."
}
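
The sketch below is not the authors' released code; it is a minimal illustration, based only on the abstract, of how soft tokens could be injected after the embedding layer of a frozen (quantized) causal LM and tuned post hoc. The model name, the number of soft tokens, and the helpers `forward_with_soft_prompt` and `calibration_step` are placeholders, and the plain cross-entropy objective stands in for whatever calibration loss the paper derives from its UBCE analysis.

```python
# Minimal sketch of post-hoc calibration via soft-prompt tuning for a quantized
# causal LM (assumptions: Hugging Face Transformers API, placeholder loss).
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "facebook/opt-1.3b"   # placeholder; substitute your quantized checkpoint
NUM_SOFT_TOKENS = 16               # illustrative choice, not from the paper

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model.requires_grad_(False)        # freeze all (quantized) model weights

embed = model.get_input_embeddings()
hidden = embed.embedding_dim

# Learnable soft tokens injected after the embedding layer.
soft_prompt = nn.Parameter(torch.randn(NUM_SOFT_TOKENS, hidden) * 0.02)
optimizer = torch.optim.AdamW([soft_prompt], lr=1e-3)

def forward_with_soft_prompt(input_ids, attention_mask):
    tok_embeds = embed(input_ids)                          # (B, T, H)
    batch = tok_embeds.size(0)
    prefix = soft_prompt.unsqueeze(0).expand(batch, -1, -1)  # (B, S, H)
    inputs_embeds = torch.cat([prefix, tok_embeds], dim=1)
    prefix_mask = torch.ones(batch, NUM_SOFT_TOKENS,
                             dtype=attention_mask.dtype,
                             device=attention_mask.device)
    mask = torch.cat([prefix_mask, attention_mask], dim=1)
    return model(inputs_embeds=inputs_embeds, attention_mask=mask).logits

def calibration_step(prompt, answer_token_id):
    """One illustrative tuning step; only the soft prompt receives gradients."""
    batch = tokenizer(prompt, return_tensors="pt")
    logits = forward_with_soft_prompt(batch["input_ids"], batch["attention_mask"])
    next_token_logits = logits[:, -1, :]
    loss = nn.functional.cross_entropy(next_token_logits,
                                       torch.tensor([answer_token_id]))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
```

Because the backbone is frozen, only the soft-prompt parameters are updated, which matches the abstract's description of recovering calibration without retraining the quantized weights.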
Markdown (Informal)
[Quantized Can Still Be Calibrated: A Unified Framework to Calibration in Quantized Large Language Models](https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.1473/) (Zhong et al., ACL 2025)