% Citation record for the ACL 2026 long paper "TellWhisper" (Hu et al.).
% Case-sensitive words in the title are brace-protected as whole words so
% that sentence-casing styles keep "TellWhisper" and "Whisper" intact.
% NOTE(review): `url` points at the ACL Anthology preview/ingest host; confirm
% and switch to the canonical aclanthology.org address once it is live.
% NOTE(review): `address` appears to hold the conference venue (ACL Anthology
% export convention) rather than the publisher's city; kept as exported.
@inproceedings{hu-etal-2026-tellwhisper,
  title     = {{TellWhisper}: Tell {Whisper} Who Speaks When},
  author    = {Hu, Yifan and
               Yang, Peiji and
               Wang, Zhisheng and
               Zhong, Yicheng and
               Liu, Rui},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.861/},
  pages     = {18884--18898},
  isbn      = {979-8-89176-390-6},
  abstract  = {Multi-speaker automatic speech recognition (MASR) aims to predict ``who spoke when and what'' from multi-speaker speech, a key technology for multi-party dialogue understanding. However, most existing approaches decouple temporal modeling and speaker modeling when addressing ``when'' and ``who'': some inject speaker cues before encoding (e.g., speaker masking), which can cause irreversible information loss; others fuse identity by mixing speaker posteriors after encoding, which may entangle acoustic content with speaker identity. This separation is brittle under rapid turn-taking and overlapping speech, often leading to degraded performance. To address these limitations, we propose \emph{TellWhisper}, a unified framework that jointly models speaker identity and temporal within the speech encoder. Specifically, we design \emph{TS-RoPE}, a time-speaker rotary positional encoding: time coordinates are derived from frame indices, while speaker coordinates are derived from speaker activity and pause cues. By applying region-specific rotation angles, the model explicitly captures per-speaker continuity, speaker-turn transitions, and state dynamics, enabling the attention mechanism to simultaneously attend to ``when'' and ``who''. Moreover, to estimate frame-level speaker activity, we develop \emph{Hyper-SD}, which casts speaker classification in hyperbolic space to enhance inter-class separation and refine speaker-activity estimates. Extensive experiments demonstrate the effectiveness of the proposed approach. The project webpage is available at https://walker-hyf.github.io/TellWhisper.},
}
Markdown (Informal)
[TellWhisper: Tell Whisper Who Speaks When](https://preview.aclanthology.org/ingest-acl/2026.acl-long.861/) (Hu et al., ACL 2026)
ACL
- Yifan Hu, Peiji Yang, Zhisheng Wang, Yicheng Zhong, and Rui Liu. 2026. TellWhisper: Tell Whisper Who Speaks When. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 18884–18898, San Diego, California, United States. Association for Computational Linguistics.