@inproceedings{singh-etal-2023-viphy,
title = "{VIPHY}: Probing ``Visible'' Physical Commonsense Knowledge",
author = "Singh, Shikhar and
Qasemi, Ehsan and
Chen, Muhao",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2023.findings-emnlp.473/",
doi = "10.18653/v1/2023.findings-emnlp.473",
pages = "7113--7128",
abstract = "Vision-language models (VLMs) have shown remarkable performance on visual reasoning tasks (e.g. attributes, location). While such tasks measure the requisite knowledge to ground and reason over a given visual instance, they do not, however, measure the ability of VLMs to retain and generalize such knowledge. In this work, we evaluate VLMs' ability to acquire ``visible'' physical knowledge {--} the information that is easily accessible from images of static scenes, particularly along the dimensions of object color, size, and space. We build an automatic pipeline to derive a comprehensive knowledge resource for calibrating and probing these models. Our results indicate a severe gap between model and human performance across all three dimensions. Furthermore, we demonstrate that a caption pretrained LM significantly outperforms VLMs on both size and spatial tasks {--} highlighting that despite sufficient access to ground language with visual modality, they struggle to retain such knowledge."
}