% Workshop paper (ComputEL-9) presenting the CoRSAL-OCR evaluation dataset.
% Mixed-case names are braced as whole words so sentence-casing styles keep
% their capitals without splitting words with mid-word brace groups.
@inproceedings{gessler-haynes-2026-corsal,
  title     = {{CoRSAL-OCR}: Evaluating Zero-Shot {OCR} for Language Archive Materials},
  author    = {Gessler, Luke and
               Haynes, Andrew},
  editor    = {Agyapong, Godfred and
               Moeller, Sarah and
               Arppe, Antti and
               Marashian, Ali and
               Rosenblum, Daisy},
  booktitle = {Proceedings of the Ninth Workshop on the Use of Computational Methods in the Study of Endangered Languages ({ComputEL-9})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.computel-1.14/},
  pages     = {125--135},
  isbn      = {979-8-89176-422-4},
  abstract  = {Language archives contain valuable linguistic materials that are undigitized and therefore difficult to access. Modern optical character recognition (OCR) systems have great potential to make these collections more accessible, but there are few system evaluations which can assess the quality of an OCR system specifically for language archive materials. We present CoRSAL-OCR, an OCR evaluation dataset of over 200 document pages with gold-standard transcriptions from two South Asian languages: Bodo (written in Devanagari) and Garo (written in Latin script). Using this dataset together with the 8-language AILLA-OCR benchmark, we evaluate four OCR systems: Tesseract, Google Cloud Vision, Gemini 3 Flash, and Qwen3.5-27B (an open-weight model). We find that vision language models (VLMs), when given appropriate prompts, achieve the lowest error rates on these datasets. However, prompt design has a large effect on VLM performance, with a detailed generic prompt reducing CER by up to six-fold compared to a minimal prompt. We release our dataset at https://github.com/larc-iu/corsal-ocr to support further research on OCR for language archives.},
}
Markdown (Informal)
[CoRSAL-OCR: Evaluating Zero-Shot OCR for Language Archive Materials](https://preview.aclanthology.org/ingest-acl-workshops/2026.computel-1.14/) (Gessler & Haynes, ComputEL 2026)
ACL