% ACL 2025 long paper (Anthology ID 2025.acl-long.1558).
% The url field uses the canonical aclanthology.org address built from that ID
% rather than the temporary preview/staging host.
% The benchmark name in the title is brace-protected as a whole word so that
% styles which downcase titles keep its capitalisation.
@inproceedings{zhao-etal-2025-urbanvideo,
    title = "{UrbanVideo-Bench}: Benchmarking Vision-Language Models on Embodied Intelligence with Video Data in Urban Spaces",
    author = "Zhao, Baining  and
      Fang, Jianjie  and
      Dai, Zichao  and
      Wang, Ziyou  and
      Zha, Jirong  and
      Zhang, Weichen  and
      Gao, Chen  and
      Wang, Yue  and
      Cui, Jinqiang  and
      Chen, Xinlei  and
      Li, Yong",
    editor = "Che, Wanxiang  and
      Nabende, Joyce  and
      Shutova, Ekaterina  and
      Pilehvar, Mohammad Taher",
    booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month = jul,
    year = "2025",
    address = "Vienna, Austria",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.acl-long.1558/",
    pages = "32400--32423",
    ISBN = "979-8-89176-251-0"
}