@inproceedings{won-etal-2025-end,
  title     = {End-to-End Multilingual Automatic Dubbing via Duration-based Translation with Large Language Models},
  author    = {Won, Hyun-Sik and
               Jeong, DongJin and
               Choi, Hyunkyu and
               Kim, Jinwon},
  editor    = {Habernal, Ivan and
               Schulam, Peter and
               Tiedemann, J{\"o}rg},
  booktitle = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing: System Demonstrations},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-demos.37/},
  pages     = {515--521},
  isbn      = {979-8-89176-334-0},
  abstract  = {Automatic dubbing (AD) aims to replace the original speech in a video with translated speech that maintains precise temporal alignment (isochrony). Achieving natural synchronization between dubbed speech and visual content remains challenging due to variations in speech durations across languages. To address this, we propose an end-to-end AD framework that leverages large language models (LLMs) to integrate translation and timing control seamlessly. At the core of our framework lies Duration-based Translation (DT), a method that dynamically predicts the optimal phoneme count based on source speech duration and iteratively adjusts the translation length accordingly. Our experiments on English, Spanish, and Korean language pairs demonstrate that our approach substantially improves speech overlap{---}achieving up to 24{\%} relative gains compared to translations without explicit length constraints{---}while maintaining competitive translation quality measured by COMET scores. Furthermore, our framework does not require language-specific tuning, ensuring practicality for multilingual dubbing scenarios.},
}
@comment{Markdown (Informal)}
@comment{
  [End-to-End Multilingual Automatic Dubbing via Duration-based Translation with Large Language Models](https://preview.aclanthology.org/ingest-emnlp/2025.emnlp-demos.37/) (Won et al., EMNLP 2025)
  ACL
}