@comment{
  Workshop paper record (inproceedings) for the AmericasNLP 2026 proceedings.
  Proper nouns and acronyms in title/booktitle are brace-protected as whole
  words so sentence-casing bibliography styles keep their capitalization.
  NOTE(review): the url field points at a preview/ingest host; confirm the
  canonical address before publishing this record.
  NOTE(review): editor "Solano, Rolando Coto" may be a mis-split compound
  surname; verify against the proceedings front matter.
}
@inproceedings{huaute-brixey-2026-towards,
  title     = {Towards a Community-accessible {Cahuilla} corpus: Developing {HTR} for {J.P.} {Harrington}'s handwritten fieldnotes on {Mountain} {Cahuilla}},
  author    = {Huaute, Ray and
               Brixey, Jacqueline},
  editor    = {Mager, Manuel and
               Ebrahimi, Abteen and
               Bui, Minh Duc and
               Pugh, Robert and
               Oncevay, Arturo and
               Chiruzzo, Luis and
               Solano, Rolando Coto and
               Rijhwani, Shruti and
               Von Der Wense, Katharina},
  booktitle = {Proceedings of the Sixth Workshop on {NLP} for Indigenous Languages of the {Americas} ({AmericasNLP})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.americasnlp-6.10/},
  pages     = {107--114},
  isbn      = {979-8-89176-415-6},
  abstract  = {This paper describes ongoing work to develop a corpus of Cahuilla language from the John Peabody Harrington collection, which contains linguistic and ethnographic fieldnotes documenting Indigenous languages of California and other regions across the Americas. Handwritten notes present numerous processing challenges, including scratch-outs, multilingual entries in Spanish and other Indigenous languages, unique abbreviations, and varying script orientations. We compare the efficacy of deep learning text recognition models to convert images of the notes into a machine-readable format, with a focus on respecting tribal data sovereignty in our methods. We find that Pylaia is the most accurate model for our data. Finally, we present the preliminary findings and indicate future directions for developing a Cahuilla corpus.},
}
Markdown (Informal)
[Towards a Community-accessible Cahuilla corpus: Developing HTR for J.P. Harrington’s handwritten fieldnotes on Mountain Cahuilla](https://preview.aclanthology.org/ingest-acl-workshops/2026.americasnlp-6.10/) (Huaute & Brixey, AmericasNLP 2026)
ACL