@comment{Findings of NAACL 2025 paper by Shim and Plank on speech-to-text performance across Italian dialects.
  Maintenance notes:
  - url is the canonical ACL Anthology address, not a temporary preview-branch build.
  - Transcribing is brace-protected in the title so that sentence-casing styles keep the capital after the question mark.
  - month uses the predefined apr macro and must stay unquoted.
  - address follows the ACL Anthology export convention, which records the conference location there.}
@inproceedings{shim-plank-2025-dialetto,
  title     = {Dialetto, ma Quanto Dialetto? {Transcribing} and Evaluating Dialects on a Continuum},
  author    = {Shim, Ryan Soh-Eun and
               Plank, Barbara},
  editor    = {Chiruzzo, Luis and
               Ritter, Alan and
               Wang, Lu},
  booktitle = {Findings of the Association for Computational Linguistics: {NAACL} 2025},
  month     = apr,
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-naacl.48/},
  pages     = {838--849},
  isbn      = {979-8-89176-195-7},
  abstract  = {There is increasing interest in looking at dialects in NLP. However, most work to date still treats dialects as discrete categories. For instance, evaluative work in variation-oriented NLP for English often works with Indian English or African-American Vernacular English as homogeneous categories, yet even within one variety there is substantial variation. We examine within-dialect variation and show that performance critically varies within categories. We measure speech-to-text performance on Italian dialects, and empirically observe a geographical performance disparity. This disparity correlates substantially (-0.5) with linguistic similarity to the highest performing dialect variety. We cross-examine our results against dialectometry methods, and interpret the performance disparity to be due to a bias towards dialects that are more similar to the standard variety in the speech-to-text model examined. We additionally leverage geostatistical methods to predict zero-shot performance at unseen sites, and find the incorporation of geographical information to substantially improve prediction performance, indicating there to be geographical structure in the performance distribution.},
}
Markdown (Informal)
[Dialetto, ma Quanto Dialetto? Transcribing and Evaluating Dialects on a Continuum](https://aclanthology.org/2025.findings-naacl.48/) (Shim & Plank, Findings 2025)
ACL