@inproceedings{soh-etal-2025-use,
title = "You Only Use Reactive Attention Slice When Retrieving From Long Context",
author = "Soh, Yun Joon and
Huang, Hanxian and
Tian, Yuandong and
Zhao, Jishen",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.findings-emnlp.1125/",
doi = "10.18653/v1/2025.findings-emnlp.1125",
pages = "20665--20686",
ISBN = "979-8-89176-335-7",
abstract = "Retrieval-Augmented Generation is a powerful method for enhancing language models (LMs), but existing retrieval techniques are limited.Embedding-based methods are often inaccurate due to their reliance on lexical similarity, while neural retrievers are computationally expensive to train.To overcome these issues, we introduce You Only Use Reactive Attention slice (YOURA), a training-free and fine-tuning-free attention-based retrieval technique. When retrieving, YOURA uses a novel reaction score heuristic, which quantifies how an LM{'}s self-attention ``reacts'' to a user query. We also propose a sentence extraction algorithm to efficiently preprocess the context.Evaluations on three open-source LMs using the LongBench and BABILong datasets show YOURA{'}s effectiveness. Our framework improves QA task accuracy by up to 15{\%} and inference throughput by up to 31{\%} compared to embedding-based retrieval."
}