% UrbanVideo-Bench (Zhao et al.), ACL 2025 long paper, ACL Anthology ID 2025.acl-long.1558.
% The url field uses the canonical Anthology address rather than the preview mirror.
% NOTE(review): the doi follows the Anthology's 10.18653/v1/<id> scheme; confirm it resolves.
@inproceedings{zhao-etal-2025-urbanvideo,
  author    = {Zhao, Baining and Fang, Jianjie and Dai, Zichao and Wang, Ziyou and Zha, Jirong and Zhang, Weichen and Gao, Chen and Wang, Yue and Cui, Jinqiang and Chen, Xinlei and Li, Yong},
  title     = {{UrbanVideo-Bench}: Benchmarking Vision-Language Models on Embodied Intelligence with Video Data in Urban Spaces},
  editor    = {Che, Wanxiang and Nabende, Joyce and Shutova, Ekaterina and Pilehvar, Mohammad Taher},
  booktitle = {Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  pages     = {32400--32423},
  isbn      = {979-8-89176-251-0},
  doi       = {10.18653/v1/2025.acl-long.1558},
  url       = {https://aclanthology.org/2025.acl-long.1558/},
}