% Conference paper record for "FlashMem" (Findings of ACL 2026).
% NOTE(review): `url` points at the preview.aclanthology.org ingest host; confirm
% whether a canonical URL should replace it once the volume is published.
% NOTE(review): `address` holds the conference location as given in the export.
@inproceedings{hou-etal-2026-flashmem,
  title     = {{FlashMem}: Distilling Intrinsic Latent Memory via Computation Reuse},
  author    = {Hou, Yubo and
               Chen, Zhisheng and
               Wan, Tao and
               Qin, Zengchang},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.230/},
  pages     = {4687--4705},
  isbn      = {979-8-89176-395-1},
  abstract  = {The stateless architecture of Large Language Models inherently lacks the mechanism to preserve dynamic context, compelling agents to redundantly reprocess history to maintain long-horizon autonomy. While latent memory offers a solution, current approaches are hindered by architectural segregation, relying on auxiliary encoders that decouple memory from the reasoning backbone. We propose FlashMem, a framework that distills intrinsic memory directly from transient reasoning states via computation reuse. Leveraging the property that internal representations uniquely encode input trajectories, FlashMem identifies the last hidden state as a sufficient statistic for the interaction history. This enables a Shared-KV Consolidator to synthesize memory by attending directly to the backbone's frozen cache, eliminating redundant re-parameterization. Furthermore, a parameter-free Cognitive Monitor leverages attention entropy to adaptively trigger consolidation only when high epistemic uncertainty is detected. Experiments demonstrate that FlashMem matches the performance of heavy baselines while reducing inference latency by 5 times, effectively bridging the gap between efficiency and persistent cognition.},
}
Markdown (Informal)
[FlashMem: Distilling Intrinsic Latent Memory via Computation Reuse](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.230/) (Hou et al., Findings 2026)
ACL