% Conference paper in the IWSDS 2025 proceedings (ACL Anthology ID 2025.iwsds-1.26).
% The url field uses the permanent aclanthology.org address rather than a
% temporary preview/staging host, so the link survives removal of preview branches.
% NOTE(review): address appears to hold the conference venue, as in ACL Anthology
% exports, not the publisher's city -- confirm against the target style.
@inproceedings{yamamoto-etal-2025-analysis,
  title     = {Analysis of Voice Activity Detection Errors in {API}-based Streaming {ASR} for Human-Robot Dialogue},
  author    = {Yamamoto, Kenta and
               Takeda, Ryu and
               Komatani, Kazunori},
  editor    = {Torres, Maria Ines and
               Matsuda, Yuki and
               Callejas, Zoraida and
               del Pozo, Arantza and
               D'Haro, Luis Fernando},
  booktitle = {Proceedings of the 15th International Workshop on Spoken Dialogue Systems Technology},
  month     = may,
  year      = {2025},
  address   = {Bilbao, Spain},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.iwsds-1.26/},
  pages     = {245--253},
  isbn      = {979-8-89176-248-0},
  abstract  = {In human-robot dialogue systems, streaming automatic speech recognition (ASR) services (e.g., Google ASR) are often utilized, with the microphone positioned close to the robot{'}s loudspeaker. Under these conditions, both the robot{'}s and the user{'}s utterances are captured, resulting in frequent failures to detect user speech. This study analyzes voice activity detection (VAD) errors by comparing results from such streaming ASR to those from standalone VAD models. Experiments conducted on three distinct dialogue datasets showed that streaming ASR tends to ignore user utterances immediately following system utterances. We discuss the underlying causes of these VAD errors and provide recommendations for improving VAD performance in human-robot dialogue.},
}
Markdown (Informal)
[Analysis of Voice Activity Detection Errors in API-based Streaming ASR for Human-Robot Dialogue](https://aclanthology.org/2025.iwsds-1.26/) (Yamamoto et al., IWSDS 2025)
ACL