% Workshop paper record in ACL Anthology export layout.
% NOTE(review): the url field points at a preview.aclanthology.org "ingest" path,
% which looks like a staging address -- confirm the canonical Anthology URL
% (and add a DOI if one is assigned) before relying on this entry.
@inproceedings{bonilla-2026-beyond,
  title     = {Beyond Acoustics: Isolating Dialectal and Sociolinguistic Bias in {Spanish} {ASR}},
  author    = {Bonilla, Johnatan E.},
  editor    = {Card, Dallas and
               Field, Anjalie and
               Keith, Katherine and
               Mendelsohn, Julia},
  booktitle = {Proceedings of the Seventh Workshop on Natural Language Processing and Computational Social Science},
  month     = jul,
  year      = {2026},
  address   = {San Diego},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.nlpcss-1.8/},
  pages     = {123--132},
  isbn      = {979-8-89176-426-2},
  abstract  = {Large-scale ASR systems such as Whisper achieve competitive aggregate Word Error Rate (WER) on multilingual benchmarks, but this aggregate conceals systematic disparities across speaker populations. We evaluate Whisper large-v3 on 276 recordings from the \textit{Corpus Oral y Sonoro del Espa{\~n}ol Rural} (COSER), a dialectological archive of elderly rural speakers across all Spanish provinces. WER is computed separately for Informants and Interviewers within each recording, revealing that mixed-role evaluation underestimates Informant WER in the majority of provinces, with the largest corrections in southern areas. Negative Binomial regression with cluster-robust standard errors shows that Andalusia and Extremadura generate significantly more Informant errors than the Castilian heartland (Andalusia IRR = 1.20, $p < 0.001$; Extremadura IRR = 1.24, $p = 0.020$), while no geographic predictor reaches significance for Interviewers sharing the same recording environment. Male Informants generate 12.5{\%} more errors than females after geographic adjustment ($p < 0.001$), consistent with differential vernacular retention in traditional rural communities. The geographic pattern aligns with established dialectological classifications of Peninsular Spanish. These results demonstrate that role-disaggregated evaluation is a necessary methodological prerequisite for fairness audits of ASR systems applied to sociolinguistically diverse corpora: aggregate benchmarks systematically suppress disparities that are borne disproportionately by the most underrepresented speaker populations, and their use in isolation constitutes both an allocative harm and a measurement failure.},
}
Markdown (Informal)
[Beyond Acoustics: Isolating Dialectal and Sociolinguistic Bias in Spanish ASR](https://preview.aclanthology.org/ingest-acl-workshops/2026.nlpcss-1.8/) (Bonilla, NLP+CSS 2026)
ACL