@inproceedings{rafiuddin-khan-2025-learning,
title = "Learning What to Remember: Adaptive Probabilistic Memory Retention for Memory-Efficient Language Models",
author = "Rafiuddin, S M and
Khan, Muntaha Nujat",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.findings-emnlp.212/",
doi = "10.18653/v1/2025.findings-emnlp.212",
pages = "3969--3981",
ISBN = "979-8-89176-335-7",
abstract = "Transformer attention scales quadratically with sequence length $O(n^2)$, limiting long-context use. We propose \textit{Adaptive Retention}, a probabilistic, layer-wise token selection mechanism that learns which representations to keep under a strict global budget $M$. Retention is modeled with Bernoulli gates trained via a Hard-Concrete/variational relaxation and enforced with a simple top-$M$ rule at inference, making the method differentiable and drop-in for standard encoders. Across classification, extractive QA, and long-document summarization, keeping only 30{--}50{\%} of tokens preserves $\geq 95\%$ of full-model performance while cutting peak memory by $\sim 35${--}45{\%} and improving throughput by up to $\sim 1.8\times$. This architecture-agnostic approach delivers practical long-context efficiency without modifying base attention or task heads."
}

Markdown (Informal)
[Learning What to Remember: Adaptive Probabilistic Memory Retention for Memory-Efficient Language Models](https://aclanthology.org/2025.findings-emnlp.212/) (Rafiuddin & Khan, Findings 2025)
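The abstract describes per-token Bernoulli retention gates trained with a Hard-Concrete/variational relaxation and enforced with a top-$M$ rule at inference. The PyTorch sketch below is a minimal illustration of that general recipe, not the authors' released implementation: the module name `TokenGate`, the linear scorer, the `budget_m` argument, and the Hard-Concrete constants (the common defaults from Louizos et al., 2018) are all assumptions made for this example.

```python
# Minimal illustrative sketch of a Hard-Concrete token-retention gate with a
# top-M rule at inference. Not the paper's released code; names and
# hyperparameters here are assumptions for illustration only.
import torch
import torch.nn as nn


class TokenGate(nn.Module):
    """Per-token retention gate: Hard-Concrete relaxation of a Bernoulli keep/drop decision."""

    def __init__(self, hidden_size: int,
                 gamma: float = -0.1, zeta: float = 1.1, beta: float = 2 / 3):
        super().__init__()
        self.scorer = nn.Linear(hidden_size, 1)  # produces a log-alpha score per token
        self.gamma, self.zeta, self.beta = gamma, zeta, beta

    def forward(self, hidden: torch.Tensor, budget_m: int) -> torch.Tensor:
        # hidden: (batch, seq_len, hidden_size) -> gate values in [0, 1]: (batch, seq_len)
        log_alpha = self.scorer(hidden).squeeze(-1)
        if self.training:
            # Sample the stretched, rectified concrete variable (differentiable).
            u = torch.rand_like(log_alpha).clamp(1e-6, 1 - 1e-6)
            s = torch.sigmoid((u.log() - (1 - u).log() + log_alpha) / self.beta)
        else:
            # Deterministic gate at inference.
            s = torch.sigmoid(log_alpha)
        z = torch.clamp(s * (self.zeta - self.gamma) + self.gamma, 0.0, 1.0)
        if not self.training:
            # Enforce the global budget: keep only the top-M scoring tokens.
            keep = torch.topk(z, k=min(budget_m, z.size(-1)), dim=-1).indices
            mask = torch.zeros_like(z)
            mask.scatter_(-1, keep, 1.0)
            z = mask
        return z
```

In a setup along these lines, the soft gate `z` would typically multiply the token representations during training (with a sparsity or budget penalty steering expected usage toward $M$), while at inference the hard top-$M$ mask lets dropped tokens be physically pruned, which is where the reported memory and throughput savings would come from.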