% ComputEL-8 workshop paper introducing the AILLA-OCR dataset.
% Notes for maintainers:
%   - url is the permanent ACL Anthology address, not a temporary preview build.
%   - address holds the conference venue, following the ACL Anthology convention.
%   - braces in title protect whole words (acronyms, proper nouns) from style recasing.
@inproceedings{agarwal-anastasopoulos-2025-ailla,
  title     = {{AILLA}-{OCR}: A First Textual and Structural Post-{OCR} Dataset for 8 Indigenous Languages of {Latin} {America}},
  author    = {Agarwal, Milind and
               Anastasopoulos, Antonios},
  editor    = {Lachler, Jordan and
               Agyapong, Godfred and
               Arppe, Antti and
               Moeller, Sarah and
               Chaudhary, Aditi and
               Rijhwani, Shruti and
               Rosenblum, Daisy},
  booktitle = {Proceedings of the Eighth Workshop on the Use of Computational Methods in the Study of Endangered Languages},
  month     = mar,
  year      = {2025},
  address   = {Honolulu, Hawaii, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.computel-main.13/},
  pages     = {120--127},
  abstract  = {It is by now common knowledge in the NLP community that low-resource languages need large-scale data creation efforts and novel contributions in the form of robust algorithms that work in data-scarce settings. Amongst these languages, however, many have a large amount of data, ripe for NLP applications, except that this data exists in image-based formats. This includes scanned copies of extremely valuable dictionaries, linguistic field notes, children{'}s stories, plays, and other textual material. To extract the text data from these non machine-readable images, Optical Character Recognition (OCR) is the most popular technique, but it has proven to be challenging for low-resource languages because of their unique properties (uncommon diacritics, rare words etc.) and due to a general lack of preserved page-structure in the OCR output. So, to contribute to the reduction of these two big bottlenecks (lack of text data and layout quality), we release the first textual and structural OCR dataset for 8 indigenous languages of Latin America. We hope that our dataset will encourage researchers within the NLP and Computational Linguistics communities to work with these languages.},
}
Markdown (Informal)
[AILLA-OCR: A First Textual and Structural Post-OCR Dataset for 8 Indigenous Languages of Latin America](https://aclanthology.org/2025.computel-main.13/) (Agarwal & Anastasopoulos, ComputEL 2025)
ACL