% Workshop paper record: "The Context Trap" (Tam, Chang, Chen), IWSDS 2026, pp. 76--82.
% The editor list is the proceedings' editor list, one name per line in "Last, First" form.
% NOTE(review): the url field points at a preview.aclanthology.org staging path;
% confirm the canonical Anthology URL (and add a doi if one exists) before relying on it.
@inproceedings{tam-etal-2026-context,
  title     = {The Context Trap: Why End-to-End Audio Language Models Fail Multi-turn Dialogues},
  author    = {Tam, Zhi Rui and
               Chang, Wen Yu and
               Chen, Yun-Nung},
  editor    = {Riccardi, Giuseppe and
               Mousavi, Seyed Mahed and
               Torres, Maria Ines and
               Yoshino, Koichiro and
               Callejas, Zoraida and
               Chowdhury, Shammur Absar and
               Chen, Yun-Nung and
               Bechet, Frederic and
               Gustafson, Joakim and
               Damnati, G{\'e}raldine and
               Papangelis, Alex and
               D'Haro, Luis Fernando and
               Mendon{\c{c}}a, John and
               Bernardi, Raffaella and
               Hakkani-Tur, Dilek and
               Di Fabbrizio, Giuseppe {``}Pino{''} and
               Kawahara, Tatsuya and
               Alam, Firoj and
               Tur, Gokhan and
               Johnston, Michael},
  booktitle = {Proceedings of the 16th International Workshop on Spoken Dialogue System Technology},
  month     = feb,
  year      = {2026},
  address   = {Trento, Italy},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/dashboard-stats/2026.iwsds-1.7/},
  pages     = {76--82},
  abstract  = {This study systematically compares end-to-end ({E2E}) audio language models ({AudioLMs}) against modular ({ASR}, {LLM}, {TTS}) systems for multi-phase task-oriented dialogues. We evaluate open-source models on key metrics: conversational naturalness and dialogue consistency. Our findings show that {E2E} configurations consistently underperform their modular counterparts, exhibiting severe degradation in dialogue quality across turns. Investigating this failure, our analysis reveals that the core issue lies in the {E2E} models' dialogue modeling capabilities, specifically in context maintenance and topic tracking. This work highlights a critical gap between the purported low-latency benefit of {AudioLMs} and their practical ability to maintain coherence in complex, multi-turn dialogues, suggesting a need for focused architectural improvements.},
}
Markdown (Informal)
[The Context Trap: Why End-to-End Audio Language Models Fail Multi-turn Dialogues](https://preview.aclanthology.org/dashboard-stats/2026.iwsds-1.7/) (Tam et al., IWSDS 2026)
ACL