@comment{IWSLT 2026 paper. The url field holds the canonical ACL Anthology address (matching the Anthology ID in the doi), not a temporary preview build.}
@inproceedings{xue-etal-2026-practical,
  title     = {A Practical Evaluation Method for Long-Form Simultaneous Speech-to-Speech Translation},
  author    = {Xue, Yulin and
               Ouyang, Siqi and
               Li, Lei},
  editor    = {Salesky, Elizabeth and
               Anastasopoulos, Antonios and
               Negri, Matteo and
               Federico, Marcello},
  booktitle = {Proceedings of the 23rd International Conference on Spoken Language Translation ({IWSLT} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, USA (in-person and online)},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.iwslt-1.3/},
  doi       = {10.18653/v1/2026.iwslt-1.3},
  pages     = {32--39},
  isbn      = {979-8-89176-411-8},
  abstract  = {Simultaneous speech-to-speech translation (SimulS2ST) enables real-time cross-lingual communication, but existing evaluation has focused largely on short or pre-segmented speech rather than long-form, continuous input. Prior approaches are difficult to reproduce and make assumptions that do not hold for end-to-end systems. We present a practical evaluation method for long-form SimulS2ST. Given source speech, pre-segmented source transcripts, and reference translations, we run automatic speech recognition (ASR) and forced alignment on the generated target speech to recover token-level timestamps, then apply a sentence-embedding-based aligner to match the target text to its corresponding source sentences. This enables sentence-level computation of latency and quality metrics, including YAAL and xCOMET, which are then aggregated into final system-level scores. Experiments on representative SimulS2ST systems show that the method is effective in practice and reveal that current systems suffer from substantial latency accumulation on long speech.},
}
Markdown (Informal)
[A Practical Evaluation Method for Long-Form Simultaneous Speech-to-Speech Translation](https://aclanthology.org/2026.iwslt-1.3/) (Xue et al., IWSLT 2026)
ACL