@comment{Survey paper in Findings of ACL 2025. The url field uses the canonical ACL Anthology host; the original export pointed at the temporary ingestion-preview host, which is not a stable address.}
@inproceedings{yang-etal-2025-large-language,
  title     = {When Large Language Models Meet Speech: A Survey on Integration Approaches},
  author    = {Yang, Zhengdong and
               Shimizu, Shuichiro and
               Yu, Yahan and
               Chu, Chenhui},
  editor    = {Che, Wanxiang and
               Nabende, Joyce and
               Shutova, Ekaterina and
               Pilehvar, Mohammad Taher},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2025},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-acl.1041/},
  pages     = {20298--20315},
  isbn      = {979-8-89176-256-5},
  abstract  = {Recent advancements in large language models (LLMs) have spurred interest in expanding their application beyond text-based tasks. A large number of studies have explored integrating other modalities with LLMs, notably speech modality, which is naturally related to text. This paper surveys the integration of speech with LLMs, categorizing the methodologies into three primary approaches: text-based, latent-representation-based, and audio-token-based integration. We also demonstrate how these methods are applied across various speech-related applications and highlight the challenges in this field to offer inspiration for future research.},
}
Markdown (Informal)
[When Large Language Models Meet Speech: A Survey on Integration Approaches](https://aclanthology.org/2025.findings-acl.1041/) (Yang et al., Findings 2025)
ACL