@inproceedings{liu-etal-2026-unlocking,
    title     = {Unlocking Large Audio-Language Models for Interactive Language Learning},
    author    = {Liu, Hongfu and
                 Cui, Zhouying and
                 Gu, Xiangming and
                 Wang, Ye},
    editor    = {Demberg, Vera and
                 Inui, Kentaro and
                 Marquez, Llu{\'i}s},
    booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026},
    month     = mar,
    year      = {2026},
    address   = {Rabat, Morocco},
    publisher = {Association for Computational Linguistics},
    url       = {https://preview.aclanthology.org/ingest-eacl/2026.findings-eacl.190/},
    pages     = {3667--3690},
    isbn      = {979-8-89176-386-9},
    abstract  = {Achieving pronunciation proficiency in a second language (L2) remains a challenge, despite the development of Computer-Assisted Pronunciation Training (CAPT) systems. Traditional CAPT systems often provide unintuitive feedback that lacks actionable guidance, limiting its effectiveness. Recent advancements in audio-language models (ALMs) offer the potential to enhance these systems by providing more user-friendly feedback. In this work, we investigate ALMs for chat-based pronunciation training by introducing {L2-Arctic-plus}, an English dataset with detailed error explanations and actionable suggestions for improvement. We benchmark cascaded ASR+LLMs and existing ALMs on this dataset, specifically in detecting mispronunciation and generating actionable feedback. To improve the performance, we further propose to instruction-tune ALMs on L2-Arctic-plus. Experimental results demonstrate that our instruction-tuned models significantly outperform existing baselines on mispronunciation detection and suggestion generation in terms of both objective and human evaluation, highlighting the value of the proposed dataset.},
}
@comment{ACL Anthology "Markdown (Informal)" citation blurb, kept for reference:
[Unlocking Large Audio-Language Models for Interactive Language Learning](https://preview.aclanthology.org/ingest-eacl/2026.findings-eacl.190/) (Liu et al., Findings 2026)
ACL}