% ComputEL 2025 workshop paper on an OCR pipeline for digitizing Kwak'wala legacy texts.
% The language name is brace-protected in the title so sentence-casing styles keep its capital.
@inproceedings{agarwal-etal-2025-developing,
    title = {Developing a Mixed-Methods Pipeline for Community-Oriented Digitization of {Kwak'wala} Legacy Texts},
    author = {Agarwal, Milind and
      Anastasopoulos, Antonios and
      Rosenblum, Daisy},
    editor = {Lachler, Jordan and
      Agyapong, Godfred and
      Arppe, Antti and
      Moeller, Sarah and
      Chaudhary, Aditi and
      Rijhwani, Shruti and
      Rosenblum, Daisy},
    booktitle = {Proceedings of the Eight Workshop on the Use of Computational Methods in the Study of Endangered Languages},
    month = mar,
    year = {2025},
    address = {Honolulu, Hawaii, USA},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2025.computel-main.15/},
    pages = {133--138},
    abstract = {Kwak'wala is an Indigenous language spoken in British Columbia, with a rich legacy of published documentation spanning more than a century, and an active community of speakers, teachers, and learners engaged in language revitalization. Over 11 volumes of the earliest texts created during the collaboration between Franz Boas and George Hunt have been scanned but remain unreadable by machines. Complete digitization through optical character recognition has the potential to facilitate transliteration into modern orthographies and the creation of other language technologies. In this paper, we apply the latest OCR techniques to a series of Kwak'wala texts only accessible as images, and discuss the challenges and unique adaptations necessary to make such technologies work for these real-world texts. Building on previous methods, we propose using a mix of off-the-shelf OCR methods, language identification, and masking to effectively isolate Kwak'wala text, along with post-correction models, to produce a final high-quality transcription.}
}
Markdown (Informal)
[Developing a Mixed-Methods Pipeline for Community-Oriented Digitization of Kwak’wala Legacy Texts](https://aclanthology.org/2025.computel-main.15/) (Agarwal et al., ComputEL 2025)
ACL