% Conference paper from the IWSLT 2026 proceedings; the doi field carries the
% record identifier 2026.iwslt-1.20.
% The url field uses the canonical aclanthology.org address for that identifier
% rather than a preview.aclanthology.org branch path, which is a staging
% location and not a stable citation target.
@inproceedings{pong-2026-towards,
  author    = {Pong, Benjamin},
  title     = {Towards Dynamic Attention Masking for Simultaneous Speech Translation},
  editor    = {Salesky, Elizabeth and
               Anastasopoulos, Antonios and
               Negri, Matteo and
               Federico, Marcello},
  booktitle = {Proceedings of the 23rd International Conference on Spoken Language Translation ({IWSLT} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, USA (in-person and online)},
  publisher = {Association for Computational Linguistics},
  pages     = {183--188},
  isbn      = {979-8-89176-411-8},
  doi       = {10.18653/v1/2026.iwslt-1.20},
  url       = {https://aclanthology.org/2026.iwslt-1.20/},
  abstract  = {We present a proof-of-concept system for simultaneous speech translation based on dynamic attention masking. Our approach builds on SeamlessM4T by injecting lightweight per-layer schedulers into the conformer-encoder, training each scheduler to predict the number of future frames needed for translation. The schedulers are trained jointly with LoRA adapters across three language directions: English to German, Italian, and Chinese. At inference time, we evaluate our system using sliding window retranslation inference regime (Sen et al., 2022), and an adapted version of StreamAtt (Papi et al., 2024) that replaces the fixed cutoff with a content-aware threshold derived from the learnt representations from the scheduler outputs.},
}
Markdown (Informal)
[Towards Dynamic Attention Masking for Simultaneous Speech Translation](https://aclanthology.org/2026.iwslt-1.20/) (Pong, IWSLT 2026)
ACL