@article{pasad-etal-2024-self,
  title     = {What Do Self-Supervised Speech Models Know About Words?},
  author    = {Pasad, Ankita and
               Chien, Chung-Ming and
               Settle, Shane and
               Livescu, Karen},
  journal   = {Transactions of the Association for Computational Linguistics},
  volume    = {12},
  year      = {2024},
  address   = {Cambridge, MA},
  publisher = {MIT Press},
  url       = {https://aclanthology.org/2024.tacl-1.21/},
  doi       = {10.1162/tacl_a_00656},
  pages     = {372--391},
  abstract  = {Many self-supervised speech models (S3Ms) have been introduced over the last few years, improving performance and data efficiency on various speech tasks. However, these empirical successes alone do not give a complete picture of what is learned during pre-training. Recent work has begun analyzing how S3Ms encode certain properties, such as phonetic and speaker information, but we still lack a proper understanding of knowledge encoded at the word level and beyond. In this work, we use lightweight analysis methods to study segment-level linguistic properties{---}word identity, boundaries, pronunciation, syntactic features, and semantic features{---}encoded in S3Ms. We present a comparative study of layer-wise representations from ten S3Ms and find that (i) the frame-level representations within each word segment are not all equally informative, and (ii) the pre-training objective and model size heavily influence the accessibility and distribution of linguistic information across layers. We also find that on several tasks{---}word discrimination, word segmentation, and semantic sentence similarity{---}S3Ms trained with visual grounding outperform their speech-only counterparts. Finally, our task-based analyses demonstrate improved performance on word segmentation and acoustic word discrimination while using simpler methods than prior work.},
}
Markdown (Informal)
[What Do Self-Supervised Speech Models Know About Words?](https://aclanthology.org/2024.tacl-1.21/) (Pasad et al., TACL 2024)
ACL