% Conference paper in the NAACL 2025 proceedings (Volume 1: Long Papers), pages 5787--5802.
% The url field uses the canonical ACL Anthology address rather than a preview/staging branch.
@inproceedings{peng-etal-2025-voicetextblender,
    title = "{VoiceTextBlender}: Augmenting Large Language Models with Speech Capabilities via Single-Stage Joint Speech-Text Supervised Fine-Tuning",
    author = "Peng, Yifan  and
      Puvvada, Krishna C  and
      Chen, Zhehuai  and
      Zelasko, Piotr  and
      Huang, He  and
      Dhawan, Kunal  and
      Hu, Ke  and
      Watanabe, Shinji  and
      Balam, Jagadeesh  and
      Ginsburg, Boris",
    editor = "Chiruzzo, Luis  and
      Ritter, Alan  and
      Wang, Lu",
    booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)",
    month = apr,
    year = "2025",
    address = "Albuquerque, New Mexico",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.naacl-long.298/",
    pages = "5787--5802",
    isbn = "979-8-89176-189-6"
}