@inproceedings{kurian-etal-2025-domain,
  title     = {Domain-Specific Adaptation for {ASR} through Text-Only Fine-Tuning},
  author    = {Kurian, Betty and
               Upadhyay, Abhinav and
               Sengupta, Abhijeet},
  editor    = {Shukla, Ankita and
               Kumar, Sandeep and
               Bedi, Amrit Singh and
               Chakraborty, Tanmoy},
  booktitle = {Proceedings of the 1st Workshop on Multimodal Models for Low-Resource Contexts and Social Impact (MMLoSo 2025)},
  month     = dec,
  year      = {2025},
  address   = {Mumbai, India},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.mmloso-1.7/},
  pages     = {78--85},
  isbn      = {979-8-89176-311-1},
  abstract  = {Speech recognition models often struggle in specialized domains due to the lack of domain-specific paired audio-text data, making it difficult to adapt general-purpose systems to unique terminology and linguistic patterns. In this work, we propose a text-only domain adaptation method for Whisper, fine-tuning only the decoder using domain-relevant text. Our approach introduces trainable cross-attention bias embeddings, extended with a gated mixture-of-experts routing mechanism, enabling the model to encode domain-specific linguistic priors without any audio data. Unlike ASR adaptation methods that require paired audio-text datasets, our approach is lightweight and resource-efficient. We observe up to a 56{\%} relative improvement in word error rate over the baseline. Our findings demonstrate that text-only adaptation is a practical and effective strategy for improving speech recognition in specialized domains with limited or no domain-specific audio.},
}
Markdown (Informal)
@comment{[Domain-Specific Adaptation for ASR through Text-Only Fine-Tuning](https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.mmloso-1.7/) (Kurian et al., MMLoSo 2025)
ACL}