% KIT system paper for the IWSLT 2026 Cross-Lingual Voice Cloning track.
% The url field holds the permanent ACL Anthology address built from the
% Anthology ID that also appears in the DOI, not a temporary preview build.
@inproceedings{akti-waibel-2026-kits,
  title     = {{KIT}'s Submission to Cross-Lingual Voice Cloning in {IWSLT} 2026},
  author    = {Akti, Seymanur and
               Waibel, Alexander},
  editor    = {Salesky, Elizabeth and
               Anastasopoulos, Antonios and
               Negri, Matteo and
               Federico, Marcello},
  booktitle = {Proceedings of the 23rd International Conference on Spoken Language Translation ({IWSLT} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, USA (in-person and online)},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.iwslt-1.8/},
  doi       = {10.18653/v1/2026.iwslt-1.8},
  pages     = {78--83},
  isbn      = {979-8-89176-411-8},
  abstract  = {Cross-lingual voice cloning aims to generate speech in a target language while preserving speaker identity from a source-language reference. This task is central to speech translation and is the focus of the IWSLT 2026 Cross-Lingual Voice Cloning track. A key challenge is maintaining intelligibility and naturalness in the presence of accent variation and domain-specific vocabulary. We build on a multilingual text-to-speech model, FishAudio-S2-Pro, and introduce language tag prompting to improve language control and reduce accent leakage. We further apply reinforcement learning (RL) fine-tuning for task adaptation and observe improvements in intelligibility. Finally, we propose a reference-conditioned lexical matching method that improves pronunciation of domain-specific terms when lexical overlap is present. Results show that language prompting provides the largest gains, while lexical matching yields consistent improvements on matched subsets.},
}
Markdown (Informal)
[KIT’s Submission to Cross-Lingual Voice Cloning in IWSLT 2026](https://aclanthology.org/2026.iwslt-1.8/) (Akti & Waibel, IWSLT 2026)
ACL