@comment{
  Conference paper from the ACL 2026 proceedings (Volume 1: Long Papers).
  NOTE(review): the url field points at a preview.aclanthology.org ingest
  build; confirm whether it should be replaced by the permanent Anthology
  URL once the volume is published.
}
@inproceedings{shi-etal-2026-robsa,
  title     = {{RoBSA}: {RoPE}-based Blockwise Sparse Multi-head Latent Attention},
  author    = {Shi, Xinyu and
               Luo, Kairong and
               Zheng, Zhen and
               Chen, Wenguang},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.46/},
  pages     = {1028--1044},
  isbn      = {979-8-89176-390-6},
  abstract  = {Large Language Models (LLMs) have rapidly advanced in recent years, scaling up in both parameter count and context length. However, as context windows extend from thousands to hundreds of thousands of tokens, attention computation becomes the dominant source of memory usage and runtime in decoding stages, severely limiting the efficiency and scalability of long-context LLMs. Sparse attention has emerged as a promising solution, reducing complexity by computing attention over only a subset of context tokens. However, the sparse attention for Multi-head Latent Attention (MLA) which is a variant of standard MHA is rarely studied. In this paper, we introduce RoPE-based Blockwise Sparse Attention (RoBSA), a method designed specifically for MLA during the decoding stage of model inference. RoBSA leverages the decoupled nature of RoPE within MLA to implement token selection in a blockwise manner. RoBSA is a lightweight, training-free, and layer-aware algorithm that can be integrated in a plug-and-play fashion. Our method significantly reduces end-to-end inference latency in the decoding stage by up to 2.55x with minimal accuracy loss compared to full attention in long-context scenarios for very large models.},
}
Markdown (Informal)
[RoBSA: RoPE-based Blockwise Sparse Multi-head Latent Attention](https://preview.aclanthology.org/ingest-acl/2026.acl-long.46/) (Shi et al., ACL 2026)
ACL
- Xinyu Shi, Kairong Luo, Zhen Zheng, and Wenguang Chen. 2026. RoBSA: RoPE-based Blockwise Sparse Multi-head Latent Attention. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 1028–1044, San Diego, California, United States. Association for Computational Linguistics.