@inproceedings{ogezi-etal-2024-semantically,
title = "Semantically-Prompted Language Models Improve Visual Descriptions",
author = "Ogezi, Michael and
Hauer, Bradley and
Kondrak, Grzegorz",
editor = "Duh, Kevin and
Gomez, Helena and
Bethard, Steven",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2024",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2024.findings-naacl.267/",
doi = "10.18653/v1/2024.findings-naacl.267",
pages = "4285--4302",
abstract = "Language-vision models like CLIP have made significant strides in vision tasks, such as zero-shot image classification (ZSIC). However, generating specific and expressive visual descriptions remains challenging; descriptions produced by current methods are often ambiguous and lacking in granularity. To tackle these issues, we propose V-GLOSS: Visual Glosses, a novel method built upon two key ideas. The first is Semantic Prompting, which conditions a language model on structured semantic knowledge. The second is a new contrastive algorithm that elicits fine-grained distinctions between similar concepts. With both ideas, we demonstrate that V-GLOSS improves visual descriptions and achieves strong results in the zero-shot setting on general and fine-grained image-classification datasets, including ImageNet, STL-10, FGVC Aircraft, and Flowers 102. Moreover, these descriptive capabilities contribute to enhancing image-generation performance. Finally, we introduce a quality-tested silver dataset with descriptions generated with V-GLOSS for all ImageNet classes."
}