% Conference paper (IWSDS 2026 proceedings, pages 182--192).
% The url field is the canonical ACL Anthology landing page for Anthology ID
% 2026.iwsds-1.20, not a temporary preview-build link.
@inproceedings{takeda-komatani-2026-retrospective,
    title = "Retrospective Speech Recognition for Spoken Dialogue System: Exploiting Subsequent Utterances to Enhance {ASR} Performance",
    author = "Takeda, Ryu and
      Komatani, Kazunori",
    editor = "Riccardi, Giuseppe and
      Mousavi, Seyed Mahed and
      Torres, Maria Ines and
      Yoshino, Koichiro and
      Callejas, Zoraida and
      Chowdhury, Shammur Absar and
      Chen, Yun-Nung and
      Bechet, Frederic and
      Gustafson, Joakim and
      Damnati, G{\'e}raldine and
      Papangelis, Alex and
      D{'}Haro, Luis Fernando and
      Mendon{\c{c}}a, John and
      Bernardi, Raffaella and
      Hakkani-Tur, Dilek and
      Di Fabbrizio, Giuseppe {``}Pino{''} and
      Kawahara, Tatsuya and
      Alam, Firoj and
      Tur, Gokhan and
      Johnston, Michael",
    booktitle = "Proceedings of the 16th International Workshop on Spoken Dialogue System Technology",
    month = feb,
    year = "2026",
    address = "Trento, Italy",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.iwsds-1.20/",
    pages = "182--192",
    abstract = "Spoken dialogue systems would benefit from the ability of self-correction, namely, revising earlier recognition results once later utterances are available, as humans often do in dialogue. However, conventional automatic speech recognition ({ASR}) frameworks mainly process user utterances sequentially and rely only on the preceding context. To address this limitation, we propose Retrospective Speech Recognition ({RSR}), which refines past recognition results by exploiting its subsequent utterances. We formulate and implement an {RSR} model for a dialogue system situation where system utterances can also be utilized. Each past user utterance is processed with an interpretable syllabogram representation, which integrates preceding and subsequent utterances within a shared domain between the signal and text levels. This intermediate representation also helps reduce orthographic inconsistencies. Experimental results using real {J}apanese dialogue speech showed that utilizing the subsequent utterances improved the character error rate by 0.10 points, which demonstrates the utility of {RSR}. We also investigated the impact of other factors, such as utilization of system utterances."
}