@inproceedings{koh-etal-2024-visualwebarena, title = "{V}isual{W}eb{A}rena: Evaluating Multimodal Agents on Realistic Visual Web Tasks", author = "Koh, Jing Yu and Lo, Robert and Jang, Lawrence and Duvvur, Vikram and Lim, Ming and Huang, Po-Yu and Neubig, Graham and Zhou, Shuyan and Salakhutdinov, Russ and Fried, Daniel", editor = "Ku, Lun-Wei and Martins, Andre and Srikumar, Vivek", booktitle = "Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)", month = aug, year = "2024", address = "Bangkok, Thailand", publisher = "Association for Computational Linguistics", url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2024.acl-long.50/", doi = "10.18653/v1/2024.acl-long.50", pages = "881--905" }