@inproceedings{agarwal-anastasopoulos-2025-ailla,
    title = "{AILLA}-{OCR}: A First Textual and Structural Post-{OCR} Dataset for 8 Indigenous Languages of {Latin} {America}",
    author = "Agarwal, Milind and
      Anastasopoulos, Antonios",
    editor = "Lachler, Jordan and
      Agyapong, Godfred and
      Arppe, Antti and
      Moeller, Sarah and
      Chaudhary, Aditi and
      Rijhwani, Shruti and
      Rosenblum, Daisy",
    booktitle = "Proceedings of the Eighth Workshop on the Use of Computational Methods in the Study of Endangered Languages",
    month = mar,
    year = "2025",
    address = "Honolulu, Hawaii, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/Ingest-2025-COMPUTEL/2025.computel-main.13/",
    pages = "120--127",
    abstract = "It is by now common knowledge in the NLP community that low-resource languages need large-scale data creation efforts and novel contributions in the form of robust algorithms that work in data-scarce settings. Amongst these languages, however, many have a large amount of data, ripe for NLP applications, except that this data exists in image-based formats. This includes scanned copies of extremely valuable dictionaries, linguistic field notes, children's stories, plays, and other textual material. To extract the text data from these non machine-readable images, Optical Character Recognition (OCR) is the most popular technique, but it has proven to be challenging for low-resource languages because of their unique properties (uncommon diacritics, rare words etc.) and due to a general lack of preserved page-structure in the OCR output. So, to contribute to the reduction of these two big bottlenecks (lack of text data and layout quality), we release the first textual and structural OCR dataset for 8 indigenous languages of Latin America. We hope that our dataset will encourage researchers within the NLP and Computational Linguistics communities to work with these languages."
}
Markdown (Informal)
[AILLA-OCR: A First Textual and Structural Post-OCR Dataset for 8 Indigenous Languages of Latin America](https://preview.aclanthology.org/Ingest-2025-COMPUTEL/2025.computel-main.13/) (Agarwal & Anastasopoulos, ComputEL 2025)
ACL