@inproceedings{zhang-ilvovsky-2026-thesis,
  title = "Thesis Proposal: Efficient {KV} Cache Reuse for Multi-Document Retrieval-Augmented Generation",
  author = "Zhang, Zhipeng and
    Ilvovsky, Dmitry",
  editor = "Baez Santamaria, Selene and
    Somayajula, Sai Ashish and
    Yamaguchi, Atsuki",
  booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 4: Student Research Workshop)",
  month = mar,
  year = "2026",
  address = "Rabat, Morocco",
  publisher = "Association for Computational Linguistics",
  url = "https://aclanthology.org/2026.eacl-srw.11/",
  pages = "160--169",
  isbn = "979-8-89176-383-8",
  abstract = "Retrieval-Augmented Generation (RAG) systems face efficiency bottlenecks in prefill due to attention mechanism, and traditional KV cache only accelerates decoding. In this context, reusing document-level KV cache computed for retrieved documents in previous sessions during the prefill stage appears to be a natural way to amortize computation, but it raises serious correctness challenges due to position and context misalignment across queries and sessions. This research proposes a multi-document KV cache reuse framework for multi-document RAG workloads across queries and sessions to resolve position misalignment and context misalignment, preserving accuracy while eliminating document-specific quadratic complexity in prefill. Theoretical analysis will establish conditions under which multi-document KV cache reuse remains stable and close to full recomputation, providing principled guarantees for both efficiency and accuracy. These results will enable deployment in existing RAG pipelines without architectural changes or model retraining. Crucially, to ensure robustness in real-world deployments, validation will extend beyond standard benchmarks to include noise-robustness tests and domain-specific workloads (e.g., legal). The research aims to empirically confirm these guarantees and demonstrate that substantial prefill speedups can be achieved without materially degrading task-level performance."
}

Markdown (Informal)
[Thesis Proposal: Efficient KV Cache Reuse for Multi-Document Retrieval-Augmented Generation](https://aclanthology.org/2026.eacl-srw.11/) (Zhang & Ilvovsky, EACL 2026)
ACL