@inproceedings{p-mamidi-2024-towards,
  title     = {Towards Efficient Audio-Text Keyword Spotting: Quantization and Multi-Scale Linear Attention with Foundation Models},
  author    = {P, Rahothvarman and
               Mamidi, Radhika},
  editor    = {Lalitha Devi, Sobha and
               Arora, Karunesh},
  booktitle = {Proceedings of the 21st International Conference on Natural Language Processing (ICON)},
  month     = dec,
  year      = {2024},
  address   = {AU-KBC Research Centre, Chennai, India},
  publisher = {NLP Association of India (NLPAI)},
  url       = {https://aclanthology.org/2024.icon-1.31/},
  pages     = {264--268},
  abstract  = {Open Vocabulary Keyword Spotting is essential in numerous applications, from virtual assistants to security systems, as it allows systems to identify specific words or phrases in continuous speech. In this paper, we propose a novel end-to-end method for detecting user-defined open vocabulary keywords by leveraging linguistic patterns for the correlation between audio and text modalities. Our approach utilizes quantized pre-trained foundation models for robust audio embeddings and a unique lightweight Multi-Scale Linear Attention (MSLA) network that aligns speech and text representations for effective cross-modal agreement. We evaluate our method on two distinct datasets, comparing its performance against other baselines. The results highlight the effectiveness of our approach, achieving significant improvements over the Cross-Modality Correspondence Detector (CMCD) method, with a 16.08{\%} increase in AUC and a 17.2{\%} reduction in EER metrics on the Google Speech Commands dataset. These findings demonstrate the potential of our method to advance keyword spotting across various real-world applications.},
}
Markdown (Informal)
[Towards Efficient Audio-Text Keyword Spotting: Quantization and Multi-Scale Linear Attention with Foundation Models](https://aclanthology.org/2024.icon-1.31/) (P & Mamidi, ICON 2024)
ACL