@inproceedings{he-etal-2025-seeing,
title = "Seeing Through Words, Speaking Through Pixels: Deep Representational Alignment Between Vision and Language Models",
author = "He, Zoe Wanying and
Trott, Sean and
Khosla, Meenakshi",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-main.1806/",
pages = "35645--35660",
ISBN = "979-8-89176-332-6",
    abstract = "Recent studies show that deep vision-only and language-only models{---}trained on disjoint modalities{---}nonetheless project their inputs into a partially aligned representational space. Yet we still lack a clear picture of \emph{where} in each network this convergence emerges, \emph{what} visual or linguistic cues support it, \emph{whether} it captures human preferences in many-to-many image-text scenarios, and \emph{how} aggregating exemplars of the same concept affects alignment. Here, we systematically investigate these questions. We find that alignment peaks in mid-to-late layers of both model types, reflecting a shift from modality-specific to conceptually shared representations. This alignment is robust to appearance-only changes but collapses when semantics are altered (e.g., object removal or word-order scrambling), highlighting that the shared code is truly semantic. Moving beyond the one-to-one image-caption paradigm, a forced-choice ``Pick-a-Pic'' task shows that human preferences for image-caption matches are mirrored in the embedding spaces across all vision-language model pairs. This pattern holds bidirectionally when multiple captions correspond to a single image, demonstrating that models capture fine-grained semantic distinctions akin to human judgments. Surprisingly, averaging embeddings across exemplars amplifies alignment rather than blurring detail. Together, our results demonstrate that unimodal networks converge on a shared semantic code that aligns with human judgments and strengthens with exemplar aggregation."
}