% Goyal and Garera, ACL 2023 Industry Track paper on streaming ASR for Hinglish voice search.
% The url field uses the canonical ACL Anthology permalink rather than a preview-build link.
@inproceedings{goyal-garera-2023-building,
  title     = {Building Accurate Low Latency {ASR} for Streaming Voice Search in {E-commerce}},
  author    = {Goyal, Abhinav and
               Garera, Nikesh},
  editor    = {Sitaram, Sunayana and
               Beigman Klebanov, Beata and
               Williams, Jason D},
  booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 5: Industry Track)},
  month     = jul,
  year      = {2023},
  address   = {Toronto, Canada},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.acl-industry.26/},
  doi       = {10.18653/v1/2023.acl-industry.26},
  pages     = {276--283},
  abstract  = {Automatic Speech Recognition (ASR) is essential for any voice-based application. The streaming capability of ASR becomes necessary to provide immediate feedback to the user in applications like Voice Search. LSTM/RNN and CTC based ASR systems are very simple to train and deploy for low latency streaming applications but have lower accuracy when compared to the state-of-the-art models. In this work, we build accurate LSTM, attention and CTC based streaming ASR models for large-scale Hinglish (blend of Hindi and English) Voice Search. We evaluate how various modifications in vanilla LSTM training improve the system's accuracy while preserving the streaming capabilities. We also discuss a simple integration of end-of-speech (EOS) detection with CTC models, which helps reduce the overall search latency. Our model achieves a word error rate (WER) of 3.69{\%} without EOS and 4.78{\%} with EOS, with {\textasciitilde}1300 ms ({\textasciitilde}46.64{\%}) reduction in latency.},
}
Markdown (Informal)
[Building Accurate Low Latency ASR for Streaming Voice Search in E-commerce](https://aclanthology.org/2023.acl-industry.26/) (Goyal & Garera, ACL 2023)
ACL