% Zhao et al., EMNLP 2025 (ACL Anthology ID 2025.emnlp-main.227).
% The url field uses the permanent aclanthology.org address rather than a
% preview.aclanthology.org staging link, which is not a stable citation target.
@inproceedings{zhao-etal-2025-cola,
  title     = {{COLA}: Collaborative Multi-Agent Framework with Dynamic Task Scheduling for {GUI} Automation},
  author    = {Zhao, Di and
               Ma, Longhui and
               Wang, Siwei and
               Wang, Miao and
               Lv, Zhao},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.emnlp-main.227/},
  pages     = {4570--4593},
  isbn      = {979-8-89176-332-6},
  abstract  = {With the rapid advancements in Large Language Models (LLMs), an increasing number of studies have leveraged LLMs as the cognitive core of agents to address complex task decision-making challenges. Specially, recent research has demonstrated the potential of LLM-based agents on automating GUI operations. However, existing methodologies exhibit two critical challenges: (1) static agent architectures struggle to adapt to diverse GUI application scenarios, leading to inadequate scenario generalization; (2) the agent workflows lack fault tolerance mechanism, necessitating complete process re-execution for GUI agent decision error. To address these limitations, we introduce COLA, a collaborative multi-agent framework for automating GUI operations. In this framework, a scenario-aware agent Task Scheduler decomposes task requirements into atomic capability units, dynamically selects the optimal agent from a decision agent pool, effectively responds to the capability requirements of diverse scenarios. Furthermore, we develop an interactive backtracking mechanism that enables human to intervene to trigger state rollbacks for non-destructive process repair. Experiments on the GAIA dataset show that COLA achieves competitive performance among GUI Agent methods, with an average accuracy of 31.89{\%}. On WindowsAgentArena, it performs particularly well in Web Browser (33.3{\%}), Media {\&} Video (33.3{\%}), and Windows Utils (25.0{\%}), suggesting the effectiveness of specialized agent design and dynamic strategy allocation. The code is available at https://github.com/Alokia/COLA-demo.},
}
Markdown (Informal)
[COLA: Collaborative Multi-Agent Framework with Dynamic Task Scheduling for GUI Automation](https://aclanthology.org/2025.emnlp-main.227/) (Zhao et al., EMNLP 2025)
ACL