% Workshop paper (FieldMatters 2026) on phonemic vs. orthographic tokenization for wav2vec 2.0 ASR.
% NOTE(review): the url field points at a preview.aclanthology.org staging branch
% ("manual-author-scripts"); confirm and switch to the canonical Anthology URL once available.
% NOTE(review): address holds "Rabat, Morocco", presumably the conference venue rather than
% the publisher's city (ACL Anthology export convention); left as exported.
@inproceedings{daul-etal-2026-linguistically,
  title     = {Linguistically Informed Tokenization Improves {ASR} for Underresourced Languages},
  author    = {Daul, Massimo Marie and
               Tosolini, Alessio and
               Bowern, Claire},
  booktitle = {Proceedings of the Fifth Workshop on {NLP} Applications to Field Linguistics},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/manual-author-scripts/2026.fieldmatters-1.4/},
  pages     = {31--37},
  abstract  = {Automatic speech recognition (ASR) is a crucial tool for linguists aiming to perform a variety of language documentation tasks. However, modern ASR systems rely on data-hungry transformer architectures, rendering them generally unusable for underresourced languages. We fine-tune a wav2vec 2.0 ASR model on Yanyhangu, an Indigenous Australian language, comparing the effects of phonemic and orthographic tokenization strategies on performance. In parallel, we explore ASR{'}s viability as a tool in a language documentation pipeline. We find that a linguistically informed phonemic tokenization system substantially improves word error rate (WER) and character error rate (CER) compared to a baseline orthographic tokenization scheme. Finally, we show that hand-correcting the output of an ASR model is much faster than hand-transcribing audio from scratch, demonstrating that ASR can provide significant assistance for underresourced language documentation.},
}
Markdown (Informal)
[Linguistically Informed Tokenization Improves ASR for Underresourced Languages](https://preview.aclanthology.org/manual-author-scripts/2026.fieldmatters-1.4/) (Daul et al., FieldMatters 2026)
ACL