@comment{
  AmericasNLP 2026 workshop paper, ACL Anthology-style export.
  Proper nouns in the title are case-protected with the same single-letter
  brace convention used elsewhere in this entry.
  NOTE(review): the url field points at a preview/ingest host; confirm and
  switch to the permanent Anthology URL once the volume is published.
}
@inproceedings{chen-etal-2026-toward,
    title = "Toward a Coarse-Labeled Spoken Language Identification Dataset for {C}entral {A}laskan {Y}up{'}ik and {S}amoan from {US} Broadcast Archives",
    author = "Chen, Yangyang and
      Rim, Kyeongmin and
      Pustejovsky, James",
    editor = "Mager, Manuel and
      Ebrahimi, Abteen and
      Bui, Minh Duc and
      Pugh, Robert and
      Oncevay, Arturo and
      Chiruzzo, Luis and
      Coto Solano, Rolando and
      Rijhwani, Shruti and
      von der Wense, Katharina",
    booktitle = "Proceedings of the Sixth Workshop on {NLP} for Indigenous Languages of the {A}mericas ({A}mericas{NLP})",
    month = jul,
    year = "2026",
    address = "San Diego, California, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/ingest-acl-workshops/2026.americasnlp-6.18/",
    pages = "203--211",
    ISBN = "979-8-89176-415-6",
    abstract = "Publicly available spoken language identification (LID) systems provide sparse and inconsistent coverage of indigenous languages of the Americas and languages of the Pacific Islands. No system on HuggingFace covers Central Alaskan Yup{'}ik except the largest variant of Meta{'}s MMS-LID family, and only three MMS-LID variants cover Samoan, while Whisper and VoxLingua107-based models lack both despite including other Polynesian languages. We describe an ongoing effort to build a coarse-labeled LID dataset for Yup{'}ik and Samoan from US public broadcast archives, benchmark publicly available LID systems on it, and train a simple MLP classifier on frozen wav2vec 2.0 representations as a prototype. We report preliminary corpus statistics, off-the-shelf model performance, and prototype results. Guided by the distinctive phonological typology of the target languages, we outline a phonologically-informed fine-tuning direction as future work."
}
Markdown (Informal)
[Toward a Coarse-Labeled Spoken Language Identification Dataset for Central Alaskan Yup’ik and Samoan from US Broadcast Archives](https://preview.aclanthology.org/ingest-acl-workshops/2026.americasnlp-6.18/) (Chen et al., AmericasNLP 2026)
ACL