% VoCoT (NAACL 2025, Volume 1: Long Papers), ACL Anthology ID 2025.naacl-long.192.
% The url below is the canonical Anthology address; the exported entry pointed at a
% temporary preview branch ("fix-sig-urls"), which is not a stable location.
% NOTE(review): the doi is built from the ACL Anthology scheme (10.18653/v1/<anthology-id>);
% confirm that it resolves before relying on it.
% "address" keeps the conference city, following ACL Anthology export convention.
@inproceedings{li-etal-2025-vocot,
    author    = {Li, Zejun and
                 Luo, Ruipu and
                 Zhang, Jiwen and
                 Qiu, Minghui and
                 Huang, Xuanjing and
                 Wei, Zhongyu},
    title     = {{VoCoT}: Unleashing Visually Grounded Multi-Step Reasoning in Large Multi-Modal Models},
    editor    = {Chiruzzo, Luis and
                 Ritter, Alan and
                 Wang, Lu},
    booktitle = {Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)},
    month     = apr,
    year      = {2025},
    address   = {Albuquerque, New Mexico},
    publisher = {Association for Computational Linguistics},
    pages     = {3769--3798},
    isbn      = {979-8-89176-189-6},
    doi       = {10.18653/v1/2025.naacl-long.192},
    url       = {https://aclanthology.org/2025.naacl-long.192/},
}