@inproceedings{wen-etal-2025-program,
title = "Program Synthesis Benchmark for Visual Programming in {XL}ogo{O}nline Environment",
author = "Wen, Chao and
Staub, Jacqueline and
Singla, Adish",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.769/",
pages = "15812--15838",
ISBN = "979-8-89176-251-0",
abstract = "Large language and multimodal models have shown remarkable success on various benchmarks focused on specific skills such as general-purpose programming, math word problem-solving, and visual question answering. However, it is unclear how well these models perform on tasks that require a combination of these skills. In this paper, we curate a novel program synthesis benchmark based on the real-world tasks in the XLogoOnline visual programming environment. Each task requires a combination of different skills such as spatial planning, basic programming, and logical reasoning. Our evaluation shows that current state-of-the-art models like GPT-4V and Llama3-70B struggle to solve these tasks, achieving only 20{\%} and 2.35{\%} success rates, respectively. Next, we develop a fine-tuning pipeline to boost the performance of models by leveraging a large-scale synthetic training dataset with over 80,000 tasks. Moreover, we showcase how emulator-driven feedback can be used to design a curriculum over training data distribution, through which a fine-tuned Llama3-8B drastically outperforms GPT-4V and Llama3-70B models. Finally, we provide an in-depth failure analysis to understand the limitations of different models. We will publicly release the benchmark for future research on program synthesis in visual programming."
}