% CoLLaVO (Lee et al.), Findings of ACL 2024. The url field is the canonical
% ACL Anthology landing page for the paper ID that also appears in the DOI.
@inproceedings{lee-etal-2024-collavo,
  author    = {Lee, Byung-Kwan and
               Park, Beomchan and
               Kim, Chae Won and
               Ro, Yong Man},
  title     = {{CoLLaVO}: Crayon Large Language and Vision {mOdel}},
  editor    = {Ku, Lun-Wei and
               Martins, Andre and
               Srikumar, Vivek},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2024},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  pages     = {1121--1138},
  doi       = {10.18653/v1/2024.findings-acl.66},
  url       = {https://aclanthology.org/2024.findings-acl.66/},
  abstract  = {The remarkable success of Large Language Models (LLMs) and instruction tuning drives the evolution of Vision Language Models (VLMs) towards a versatile general-purpose model. Yet, it remains unexplored whether current VLMs genuinely possess quality object-level image understanding capabilities determined from `what objects are in the image?' or `which object corresponds to a specified bounding box?'. Our findings reveal that the image understanding capabilities of current VLMs are strongly correlated with their zero-shot performance on vision language (VL) tasks. This suggests that prioritizing basic image understanding is crucial for VLMs to excel at VL tasks. To enhance object-level image understanding, we propose Crayon Large Language and Vision mOdel (CoLLaVO), which incorporates instruction tuning with Crayon Prompt as a new visual prompt tuning scheme based on panoptic color maps. Furthermore, we present a learning strategy of Dual QLoRA to preserve object-level image understanding without forgetting it during visual instruction tuning, thereby achieving a significant leap in numerous VL benchmarks in a zero-shot setting.},
}
Markdown (Informal)
[CoLLaVO: Crayon Large Language and Vision mOdel](https://aclanthology.org/2024.findings-acl.66/) (Lee et al., Findings 2024)
ACL
- Byung-Kwan Lee, Beomchan Park, Chae Won Kim, and Yong Man Ro. 2024. CoLLaVO: Crayon Large Language and Vision mOdel. In Findings of the Association for Computational Linguistics: ACL 2024, pages 1121–1138, Bangkok, Thailand. Association for Computational Linguistics.