@inproceedings{kim-ji-2024-finer,
title = "Finer: Investigating and Enhancing Fine-Grained Visual Concept Recognition in Large Vision Language Models",
author = "Kim, Jeonghwan and
Ji, Heng",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2024.emnlp-main.356/",
doi = "10.18653/v1/2024.emnlp-main.356",
pages = "6187--6207",
abstract = "Recent advances in instruction-tuned Large Vision-Language Models (LVLMs) have imbued the models with the ability to generate high-level, image-grounded explanations with ease. While such capability is largely attributed to the rich world knowledge contained within the Large Language Models (LLMs), our work reveals their shortcomings in fine-grained visual categorization (FGVC) across six different benchmark settings. Most recent state-of-the-art LVLMs such as LLaVa-1.5, InstructBLIP and GPT-4V not only severely deteriorate in terms of classification performance, e.g., average drop of 65.58 in EM for Stanford Dogs for LLaVA-1.5, but also struggle to generate descriptive visual attributes based on a concept that appears within an input image despite their prominent zero-shot image captioning ability. In-depth analyses show that instruction-tuned LVLMs suffer from modality gap, showing discrepancy when given textual and visual inputs that correspond to the same concept. In an effort to further the community`s endeavor in this direction, we propose a multiple granularity attribute-centric benchmark and training mixture, Finer, which aims to establish a ground to evaluate LVLMs' fine-grained visual comprehension ability and provide significantly improved explainability."
}
Markdown (Informal)
[Finer: Investigating and Enhancing Fine-Grained Visual Concept Recognition in Large Vision Language Models](https://preview.aclanthology.org/jlcl-multiple-ingestion/2024.emnlp-main.356/) (Kim & Ji, EMNLP 2024)
ACL