@inproceedings{oneata-etal-2025-seeing,
title = "Seeing What Tastes Good: Revisiting Multimodal Distributional Semantics in the Billion Parameter Era",
author = "Oneata, Dan and
Elliott, Desmond and
Frank, Stella",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/display_plenaries/2025.findings-acl.1240/",
pages = "24174--24191",
ISBN = "979-8-89176-256-5",
abstract = "Human learning and conceptual representation is grounded in sensorimotor experience, in contrast to state-of-the-art foundation models. In this paper, we investigate how well such large-scale models, trained on vast quantities of data, represent the semantic feature norms of concrete object concepts, e.g. a ROSE is red, smells sweet, and is a flower. More specifically, we use probing tasks to test which properties of objects these models are aware of. We evaluate image encoders trained on image data alone, as well as multimodally-trained image encoders and language-only models, on predicting an extended denser version of the classic McRae norms and the newer Binder dataset of attribute ratings. We find that multimodal image encoders slightly outperform language-only approaches, and that image-only encoders perform comparably to the language models, even on non-visual attributes that are classified as ``encyclopedic'' or ``function''. These results offer new insights into what can be learned from pure unimodal learning, and the complementarity of the modalities."
}