% Conference paper entry (NLP4DH 2026 proceedings); every value is brace-delimited.
@inproceedings{guo-wei-2026-ocr,
  author    = {Guo, Haoze and
               Wei, Ziqi},
  title     = {From {OCR} to Analysis: Tracking Correction Provenance in Digital Humanities Pipelines},
  editor    = {Hamilton, Sil and
               {\"O}hman, Emily and
               Hicke, Rebecca M. M. and
               Bizzoni, Yuri and
               Bax, Axel and
               Matthews, Jacob A. and
               H{\"a}m{\"a}l{\"a}inen, Mika},
  booktitle = {Proceedings of the 6th International Conference on Natural Language Processing for the Digital Humanities},
  year      = {2026},
  month     = jul,
  address   = {San Diego, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {1--12},
  isbn      = {979-8-89176-427-9},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.nlp4dh-1.1/},
  abstract  = {Optical Character Recognition (OCR) is a critical but error-prone stage in digital humanities text pipelines. While OCR correction improves usability for downstream NLP tasks, common workflows often overwrite intermediate decisions, obscuring how textual transformations affect scholarly interpretation. We present a provenance-aware framework for OCR-corrected humanities corpora that records correction lineage at the span level, including edit type, correction source, confidence, and revision status. Using a pilot corpus of historical texts, we compare downstream named entity extraction across raw OCR, fully corrected text, and provenance-filtered corrections. Our results show that correction pathways can substantially alter extracted entities and document-level interpretations, while provenance signals help identify unstable outputs and prioritize human review. We argue that provenance should be treated as a first-class analytical layer in NLP for digital humanities, supporting reproducibility, source criticism, and uncertainty-aware interpretation.},
}
Markdown (Informal)
[From OCR to Analysis: Tracking Correction Provenance in Digital Humanities Pipelines](https://preview.aclanthology.org/ingest-acl-workshops/2026.nlp4dh-1.1/) (Guo & Wei, NLP4DH 2026)
ACL