@inproceedings{loakman-etal-2025-seeing,
  title     = "Seeing isn{'}t Hearing: Benchmarking Vision Language Models at Interpreting Spectrograms",
  author    = "Loakman, Tyler and
               James, Joseph and
               Lin, Chenghua",
  editor    = "Inui, Kentaro and
               Sakti, Sakriani and
               Wang, Haofen and
               Wong, Derek F. and
               Bhattacharyya, Pushpak and
               Banerjee, Biplab and
               Ekbal, Asif and
               Chakraborty, Tanmoy and
               Singh, Dhirendra Pratap",
  booktitle = "Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics",
  month     = dec,
  year      = "2025",
  address   = "Mumbai, India",
  publisher = "The Asian Federation of Natural Language Processing and The Association for Computational Linguistics",
  url       = "https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.ijcnlp-short.7/",
  pages     = "79--86",
  isbn      = "979-8-89176-299-2",
  abstract  = "With the rise of Large Language Models (LLMs) and their vision-enabled counterparts (VLMs), numerous works have investigated their capabilities in different tasks that fuse both vision and language modalities. In this work, we benchmark the extent to which VLMs are able to act as highly-trained phoneticians, interpreting spectrograms and waveforms of speech. To do this, we synthesise a novel dataset containing 4k+ English words spoken in isolation alongside stylistically consistent spectrogram and waveform figures. We test the ability of VLMs to understand these representations of speech through a multiple-choice task whereby models must predict the correct phonemic or graphemic transcription of a spoken word when presented amongst 3 distractor transcriptions that have been selected based on their phonemic edit distance to the ground truth. We observe that both zero-shot and finetuned models rarely perform above chance, demonstrating the difficulty of this task stemming from the requirement for esoteric parametric knowledge of how to interpret such figures, rather than paired samples alone."
}

Markdown (Informal)
[Seeing isn’t Hearing: Benchmarking Vision Language Models at Interpreting Spectrograms](https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.ijcnlp-short.7/) (Loakman et al., IJCNLP-AACL 2025)
ACL
Tyler Loakman, Joseph James, and Chenghua Lin. 2025. Seeing isn’t Hearing: Benchmarking Vision Language Models at Interpreting Spectrograms. In Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics, pages 79–86, Mumbai, India. The Asian Federation of Natural Language Processing and The Association for Computational Linguistics.