% VoiceTextBlender (Peng et al.), NAACL 2025 long paper, pp. 5787--5802.
% The url field uses the canonical ACL Anthology address for ID 2025.naacl-long.298
% rather than a branch-preview host, so the link stays valid after the preview is retired.
@inproceedings{peng-etal-2025-voicetextblender,
  author    = {Peng, Yifan and Puvvada, Krishna C and Chen, Zhehuai and Zelasko, Piotr and Huang, He and Dhawan, Kunal and Hu, Ke and Watanabe, Shinji and Balam, Jagadeesh and Ginsburg, Boris},
  title     = {{VoiceTextBlender}: Augmenting Large Language Models with Speech Capabilities via Single-Stage Joint Speech-Text Supervised Fine-Tuning},
  editor    = {Chiruzzo, Luis and Ritter, Alan and Wang, Lu},
  booktitle = {Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)},
  month     = apr,
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  pages     = {5787--5802},
  isbn      = {979-8-89176-189-6},
  url       = {https://aclanthology.org/2025.naacl-long.298/},
}