@inproceedings{liu-etal-2025-scaling,
title = "Scaling up the State Size of {RNN} {LLM}s for Long-Context Scenarios",
author = "Liu, Kai and
Gao, Jianfei and
Chen, Kai",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.564/",
pages = "11516--11529",
ISBN = "979-8-89176-251-0",
abstract = "The Transformer architecture has become the standard LLM architecture due to its powerful self-attention mechanism. However, it suffers from quadratic computational complexity and linear memory complexity. RNN-based LLMs have been proposed as alternatives. Yet, RNN models struggle in long-context scenarios, making it challenging to replace self-attention with RNNs. We identify the state size as a critical bottleneck, which is significantly smaller than that of Transformers with a basic context length of 2k. However, simply increasing the state size significantly raises the number of parameters and lowers training efficiency. In this paper, we propose an efficient scaling method to scale the state size of RNN models to match the 2k context length of Transformers, with small parameters overhead. Experimental results demonstrate that scaling the state size significantly enhances long-context understanding. Retrieval performance scales almost linearly with state size, with a 454M model featuring an expanded state achieving performance comparable to a 1.47B model on FDA, a recall-intensive task. These findings highlight state scaling as a promising approach for advancing RNN-based LLMs."
}