% Workshop paper from the AmericasNLP 2026 proceedings (ACL Anthology record).
% Proper nouns in the title are brace-protected as whole words so that
% sentence-casing styles do not lowercase the language name.
% Editor names use the "von Last, First" form so particles and compound
% surnames are parsed correctly.
% NOTE(review): url points at an Anthology preview/ingest branch; presumably it
% should be replaced by the canonical address once the volume is live -- confirm.
@inproceedings{kriukova-etal-2026-data,
  title     = {A data-centric approach to performance improvement in under-resourced {ASR}: The case of {D{\"e}n{\"e}} {S{\k{u}}{\l}{\i}n{\'e}}},
  author    = {Kriukova, Olga and
               Lovick, Olga and
               Arppe, Antti},
  editor    = {Mager, Manuel and
               Ebrahimi, Abteen and
               Bui, Minh Duc and
               Pugh, Robert and
               Oncevay, Arturo and
               Chiruzzo, Luis and
               Coto-Solano, Rolando and
               Rijhwani, Shruti and
               von der Wense, Katharina},
  booktitle = {Proceedings of the Sixth Workshop on {NLP} for Indigenous Languages of the {Americas} ({AmericasNLP})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.americasnlp-6.9/},
  pages     = {95--106},
  isbn      = {979-8-89176-415-6},
  abstract  = {This paper presents a study focused on advancing Automatic Speech Recognition (ASR) for the under-resourced language D{\"e}n{\"e} S{\k{u}}{\l}{\i}n{\'e} through data-centric approaches. We explore multiple strategies to enhance the quality of training data{---}both audio recordings and transcriptions{---}to address the challenges posed by mixed-quality datasets. Our experiments investigate which data preparation techniques most effectively improve ASR performance in this context. Our findings show that reducing non-phonemic spelling variation in the corpus significantly improves model generalization, resulting in a substantial increase in recognition accuracy. Additionally, we demonstrate that increasing manually reviewed transcriptions consistently improves word and character error rates, while audio enhancement slightly reduces performance, highlighting the complex trade-offs in low-resource ASR development.},
}
Markdown (Informal)
[A data-centric approach to performance improvement in under-resourced ASR: The case of Dënë Sųłıné](https://preview.aclanthology.org/ingest-acl-workshops/2026.americasnlp-6.9/) (Kriukova et al., AmericasNLP 2026)
ACL