% Workshop paper record: PGGA, in the proceedings of the 4th ALVR workshop (2026).
% NOTE(review): `url` targets a preview.aclanthology.org ingest path -- confirm the
% canonical address before relying on it.
@inproceedings{hsiung-etal-2026-pgga,
  author    = {Hsiung, Lei and
               Chen, Zhiyu and
               Kim, Seonhoon and
               Liu, Qun},
  title     = {{PGGA}: A Plan-Grounded {GUI} Agent for Automated Device Support},
  editor    = {Yan, Qianqi and
               Montariol, Syrielle and
               Fan, Yue and
               Gu, Jing and
               Pan, Jiayi and
               Li, Manling and
               Kordjamshidi, Parisa and
               Suhr, Alane and
               Wang, Xin Eric},
  booktitle = {Proceedings of the 4th Workshop on Advances in Language and Vision Research ({ALVR})},
  month     = jul,
  year      = 2026,
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {105--114},
  isbn      = {979-8-89176-398-2},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.alvr-main.9/},
  abstract  = {Current GUI agents struggle with multi-step digital device support. We investigate whether this failure is partly caused by a procedural knowledge deficit: agents often rely on zero-shot visual exploration instead of executing verified instructions. To address this, we introduce the Plan-Grounded GUI Agent (PGGA), framing interface navigation as a knowledge-execution problem by conditioning low-level actions on step-by-step text plans. Evaluated on our focused Device-Support Interaction Benchmark (DSIB), results reveal a sharp gap between knowing which operation to perform and grounding that operation on the screen: GTA1-7B reaches 99.59{\%} Operation Accuracy with expert plans, but only 82.99{\%} Element Accuracy and 45.61{\%} Task Success Rate; without plans, its Task Success Rate is 0.00{\%}. Our fine-tuned 2B-parameter PGGA achieves 54.39{\%} Task Success Rate and 91.28{\%} Element Accuracy when guided by expert plans, suggesting that explicit procedural grounding can substantially improve GUI execution when high-quality plans are available. Project Page: https://hsiung.cc/PGGA/},
}
Markdown (Informal)
[PGGA: A Plan-Grounded GUI Agent for Automated Device Support](https://preview.aclanthology.org/ingest-acl-workshops/2026.alvr-main.9/) (Hsiung et al., ALVR 2026)
ACL