@inproceedings{zhou-etal-2025-value,
  title     = {Value Residual Learning},
  author    = {Zhou, Zhanchao and
               Wu, Tianyi and
               Jiang, Zhiyun and
               Obeid, Fares and
               Lan, Zhenzhong},
  editor    = {Che, Wanxiang and
               Nabende, Joyce and
               Shutova, Ekaterina and
               Pilehvar, Mohammad Taher},
  booktitle = {Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.acl-long.1375/},
  pages     = {28341--28356},
  isbn      = {979-8-89176-251-0},
  abstract  = {While Transformer models have achieved remarkable success in various domains, the effectiveness of information propagation through deep networks remains a critical challenge. Standard hidden state residuals often fail to adequately preserve initial token-level information in deeper layers. This paper introduces ResFormer, a novel architecture that enhances information flow by incorporating value residual connections in addition to hidden state residuals. And a variant is SVFormer, where all layers share the first layer{'}s value embedding. Comprehensive empirical evidence demonstrates ResFormer achieves equivalent validation loss with 16.11{\%} fewer model parameters and 20.3{\%} less training data compared to Transformer, while maintaining similar memory usage and computational cost. Besides, SVFormer reduces KV cache size by nearly half with only a small performance penalty and can be integrated with other KV-efficient methods, yielding further reductions in KV cache, with performance influenced by sequence length and cumulative learning rate.},
}
Markdown (Informal)
[Value Residual Learning](https://aclanthology.org/2025.acl-long.1375/) (Zhou et al., ACL 2025)
ACL
- Zhanchao Zhou, Tianyi Wu, Zhiyun Jiang, Fares Obeid, and Zhenzhong Lan. 2025. Value Residual Learning. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 28341–28356, Vienna, Austria. Association for Computational Linguistics.