% Conference paper in the NLP4DH 2025 proceedings (ACL Anthology ID 2025.nlp4dh-1.21).
% The url field uses the canonical Anthology address rather than a preview-branch mirror.
@inproceedings{backer-hyman-2025-bootstrapping,
  title     = {Bootstrapping {AI}: Interdisciplinary Approaches to Assessing {OCR} Quality in {English}-Language Historical Documents},
  author    = {Backer, Samuel and
               Hyman, Louis},
  editor    = {H{\"a}m{\"a}l{\"a}inen, Mika and
               {\"O}hman, Emily and
               Bizzoni, Yuri and
               Miyagawa, So and
               Alnajjar, Khalid},
  booktitle = {Proceedings of the 5th International Conference on Natural Language Processing for Digital Humanities},
  month     = may,
  year      = {2025},
  address   = {Albuquerque, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.nlp4dh-1.21/},
  pages     = {251--256},
  isbn      = {979-8-89176-234-3},
  abstract  = {New LLM-based OCR and post-OCR correction methods promise to transform computational historical research, yet their efficacy remains contested. We compare multiple correction approaches, including methods for ``bootstrapping'' fine-tuning with LLM-generated data, and measure their effect on downstream tasks. Our results suggest that standard OCR metrics often underestimate performance gains for historical research, underscoring the need for discipline-driven evaluations that can better reflect the needs of computational humanists.},
}
Markdown (Informal)
[Bootstrapping AI: Interdisciplinary Approaches to Assessing OCR Quality in English-Language Historical Documents](https://aclanthology.org/2025.nlp4dh-1.21/) (Backer & Hyman, NLP4DH 2025)
ACL