@inproceedings{sadhu-etal-2025-vestabench,
title = "{V}esta{B}ench: An Embodied Benchmark for Safe Long-Horizon Planning Under Multi-Constraint and Adversarial Settings",
author = "Sadhu, Tanmana and
Chen, Yanan and
Pesaranghader, Ali",
editor = "Potdar, Saloni and
Rojas-Barahona, Lina and
Montella, Sebastien",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: Industry Track",
month = nov,
year = "2025",
address = "Suzhou (China)",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-industry.149/",
pages = "2122--2145",
ISBN = "979-8-89176-333-3",
abstract = "Large language models (LLMs) are applied to reasoning and (automated) planning across diverse domains, from travel itineraries to embodied AI tasks. However, concerns have been raised about their suitability for long-horizon tasks involving multiple constraints, as they are prone to hallucinations, particularly in adversarial scenarios. Safety reasoning also becomes critical for embodied AI agents, which interact with their physical environments to complete tasks on behalf of humans. However, existing (safety) benchmarks fail to represent a diverse range of multi-constraint tasks that require long-horizon planning with a focus on safety. To address this, we propose VESTABENCH, a benchmark curated using VirtualHome and BEHAVIOR-100. Our VESTABENCH includes (1) tasks that can be achieved safely under adversarial and multi-constraint settings, as well as (2) adversarial instructions that the agent must avoid. Our experiments with state-of-the-art LLM-based baselines reveal that they perform poorly against our tasks, not only achieving low success rates but also suffering significantly compromised safety outcomes. This observation reinforces the limitations of LLMs in generating safe plans when faced with adversarial settings or instructions. Finally, we believe that our findings benefit the research and industry communities."
}Markdown (Informal)
[VestaBench: An Embodied Benchmark for Safe Long-Horizon Planning Under Multi-Constraint and Adversarial Settings](https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-industry.149/) (Sadhu et al., EMNLP 2025)
ACL