@inproceedings{wu-tu-2024-layer,
title = "Layer-Condensed {KV} Cache for Efficient Inference of Large Language Models",
author = "Wu, Haoyi and
Tu, Kewei",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2024.acl-long.602/",
doi = "10.18653/v1/2024.acl-long.602",
pages = "11175--11188",
abstract = "Huge memory consumption has been a major bottleneck for deploying high-throughput large language models in real-world applications. In addition to the large number of parameters, the key-value (KV) cache for the attention mechanism in the transformer architecture consumes a significant amount of memory, especially when the number of layers is large for deep language models. In this paper, we propose a novel method that only computes and caches the KVs of a small number of layers, thus significantly saving memory consumption and improving inference throughput. Our experiments on large language models show that our method achieves up to 26$\times$ higher throughput than standard transformers and competitive performance in language modeling and downstream tasks. In addition, our method is orthogonal to existing transformer memory-saving techniques, so it is straightforward to integrate them with our model, achieving further improvement in inference efficiency. Our code is available at https://github.com/whyNLP/LCKV."
}