% Conference paper from the UNLP 2026 proceedings (ACL Anthology ID 2026.unlp-1.7).
% Whole words are braced in title/booktitle so sentence-casing styles keep their capitals.
% The url field uses the canonical Anthology address instead of a temporary preview branch.
% NOTE(review): address holds the conference venue as exported; confirm the target style
% does not expect the publisher's location here.
@inproceedings{chaplynskyi-dydyk-meush-2026-digitizing,
  author    = {Chaplynskyi, Dmytro and Dydyk-Meush, Hanna},
  title     = {Digitizing Old {Ukrainian} Texts: A Prompt-Based {OCR} Pipeline and Evaluation Dataset},
  editor    = {Romanyshyn, Mariana},
  booktitle = {Proceedings of the Fifth {Ukrainian} Natural Language Processing Conference ({UNLP} 2026)},
  month     = may,
  year      = {2026},
  address   = {Lviv, Ukraine},
  publisher = {Association for Computational Linguistics},
  pages     = {58--66},
  isbn      = {979-8-89176-359-3},
  url       = {https://aclanthology.org/2026.unlp-1.7/},
  abstract  = {We present a methodology and an open dataset for OCR of handwritten index cards containing a scholarly transcription of an early 17th-century Ukrainian polemical text, Perestoroha by Iov Boretskyi (Lviv, 1605{--}1606). The 430 cards, produced by 20th-century researchers, preserve the text in Old Ukrainian orthography with archaic diacritics, titlos, superscript letters, and ligatures that make automated recognition non-trivial. We develop a prompt-based OCR pipeline driven by a custom instruction set designed iteratively from the source material{'}s orthographic conventions. The pipeline is evaluated against human-proofread ground truth in proprietary and open-source configurations using identical instructions and evaluation data. The proprietary configuration with extended thinking at maximum budget (Claude Opus 4.7, xhigh) achieves a Character Error Rate of 2.5{\%}; an Opus 4.6 baseline at the default 2,048-token thinking budget {---} used for the first batch of the released dataset {---} reaches 4.2{\%}; and two open-source Qwen3.6 variants running locally on consumer hardware reach 14.6{\%} (dense 27B) and 14.8{\%} (35B-A3B MoE). We release the fully digitized text aligned at line level to 300 DPI scanned images, as both a scholarly digital resource and training data for future OCR systems targeting Old Slavic manuscripts.},
}
Markdown (Informal)
[Digitizing Old Ukrainian Texts: A Prompt-Based OCR Pipeline and Evaluation Dataset](https://aclanthology.org/2026.unlp-1.7/) (Chaplynskyi & Dydyk-Meush, UNLP 2026)
ACL