% IWSLT 2025 paper introducing the CV-UrEnST Urdu-English speech-to-text corpus.
% The url field uses the canonical ACL Anthology address for paper ID
% 2025.iwslt-1.12 rather than the temporary preview/staging host.
% Proper nouns and acronyms are brace-protected as whole words so that
% sentence-casing bibliography styles keep their capitals.
@inproceedings{mehmood-abdul-rauf-2025-human,
  title     = {Human-Evaluated {Urdu}-{English} Speech Corpus: Advancing Speech-to-Text for Low-Resource Languages},
  author    = {Mehmood, Humaira and
               Abdul Rauf, Sadaf},
  editor    = {Salesky, Elizabeth and
               Federico, Marcello and
               Anastasopoulos, Antonis},
  booktitle = {Proceedings of the 22nd International Conference on Spoken Language Translation ({IWSLT} 2025)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria (in-person and online)},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.iwslt-1.12/},
  pages     = {138--144},
  isbn      = {979-8-89176-272-5},
  abstract  = {This paper presents our contribution to the IWSLT Low Resource Track 2: `Training and Evaluation Data Track'. We share a human-evaluated Urdu-English speech-to-text corpus based on Common Voice 13.0 Urdu speech corpus. We followed a three-tier validation scheme which involves an initial automatic translation with corrections from native reviewers, full review by evaluators followed by final validation from a bilingual expert ensuring reliable corpus for subsequent NLP tasks. Our contribution, CV-UrEnST corpus, enriches Urdu speech resources by contributing the first Urdu-English speech-to-text corpus. When evaluated with Whisper-medium, the corpus yielded a significant improvement to the vanilla model in terms of BLEU, chrF++, and COMET scores, demonstrating its effectiveness for speech translation tasks.},
}
Markdown (Informal)
[Human-Evaluated Urdu-English Speech Corpus: Advancing Speech-to-Text for Low-Resource Languages](https://aclanthology.org/2025.iwslt-1.12/) (Mehmood & Abdul Rauf, IWSLT 2025)
ACL