% TagSpeech -- ACL 2026 long paper, Anthology ID 2026.acl-long.1938.
% NOTE(review): `url` points at the Anthology preview/ingest host; confirm the
% canonical URL once the volume is published. `address` holds the conference
% venue as exported by the Anthology -- verify against the target style.
@inproceedings{huo-etal-2026-tagspeech,
  title     = {{TagSpeech}: End-to-End Multi-Speaker {ASR} and Diarization with Fine-Grained Temporal Grounding},
  author    = {Huo, Mingyue and
               Shao, Yiwen and
               Zhang, Yuheng},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.1938/},
  pages     = {41847--41862},
  isbn      = {979-8-89176-390-6},
  abstract  = {We present TagSpeech, a unified LLM-based framework that utilizes Temporal Anchor Grounding for joint multi-speaker ASR and diarization. The framework is built on two key designs: (1) decoupled semantic and speaker streams fine-tuned via Serialized Output Training (SOT) to learn turn-taking dynamics; and (2) an interleaved time anchor mechanism that not only supports fine-grained timestamp prediction but also acts as a synchronization signal between semantic understanding and speaker tracking. Compared to previous works that primarily focus on speaker-attributed ASR or implicit diarization, TagSpeech addresses the challenge of fine-grained speaker-content alignment and explicitly models who spoke what and when in an end-to-end manner. Experiments on AMI and AliMeeting benchmarks demonstrate that our method achieves consistent improvements in Diarization Error Rate (DER) over strong end-to-end baselines, including Qwen-Omni and Gemini, particularly in handling complex speech overlaps. Moreover, TagSpeech employs a parameter-efficient training paradigm in which the LLM backbone is frozen and only lightweight projectors are trained, resulting in strong performance with low computational cost.},
}
Markdown (Informal)
[TagSpeech: End-to-End Multi-Speaker ASR and Diarization with Fine-Grained Temporal Grounding](https://preview.aclanthology.org/ingest-acl/2026.acl-long.1938/) (Huo et al., ACL 2026)
ACL