% HiSA paper (Li, Paik, Sui) in Findings of the ACL: ACL 2026, pp. 11965--11985.
@inproceedings{li-etal-2026-hisa,
  title     = {{HiSA}: Hierarchical State Abstraction for Scalable {GUI} Agents},
  author    = {Li, Weiming and
               Paik, Hye-young and
               Sui, Yulei},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.581/},
  pages     = {11965--11985},
  isbn      = {979-8-89176-395-1},
  abstract  = {Multimodal GUI agents generally operate on raw visual and textual observations, which creates a fundamental scalability challenge. While current state-of-the-art frameworks predominantly rely on inference-intensive test-time scaling or the accumulation of unbounded raw logs to maintain task coherence, we attribute the underlying bottleneck to insufficient state abstraction. To address this, we propose HiSA, a hierarchical state abstraction approach that actively restructures knowledge rather than passively retaining historical information by organizing raw histories into a three-level hierarchy of abstracted steps, refined contexts, and induced patterns. By synthesizing high-dimensional observations into compact semantic states, HiSA decouples reasoning efficacy from context length, enabling precise and scalable decision-making as interaction histories grow. When evaluating using Spider2-V, our approach establishes a new state-of-the-art, achieving a 40.58{\%} success rate while reducing token consumption by 69.85{\%} and monetary costs by 55.10{\%} compared to the best-performing baseline.},
}
Markdown (Informal)
[HiSA: Hierarchical State Abstraction for Scalable GUI Agents](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.581/) (Li et al., Findings 2026)
ACL