% BLN600 corpus paper (Booth, Thomas and Gaizauskas), LREC-COLING 2024.
% The url field holds the canonical ACL Anthology address for this paper ID,
% not a preview/staging build, so the link stays valid over time.
@inproceedings{booth-etal-2024-bln600,
  title     = {{BLN600}: A Parallel Corpus of Machine/Human Transcribed Nineteenth Century Newspaper Texts},
  author    = {Booth, Callum William and
               Thomas, Alan and
               Gaizauskas, Robert},
  editor    = {Calzolari, Nicoletta and
               Kan, Min-Yen and
               Hoste, Veronique and
               Lenci, Alessandro and
               Sakti, Sakriani and
               Xue, Nianwen},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation ({LREC-COLING} 2024)},
  month     = may,
  year      = {2024},
  address   = {Torino, Italia},
  publisher = {ELRA and ICCL},
  url       = {https://aclanthology.org/2024.lrec-main.219/},
  pages     = {2440--2446},
  abstract  = {We present a publicly available corpus of nineteenth-century newspaper text focused on crime in London, derived from the Gale British Library Newspapers corpus parts 1 and 2. The corpus comprises 600 newspaper excerpts and for each excerpt contains the original source image, the machine transcription of that image as found in the BLN and a gold standard manual transcription that we have created. We envisage the corpus will be helpful for the training and development of OCR and post-OCR correction methodologies for historical newspaper machine transcription{---}for which there is currently a dearth of publicly available resources. In this paper, we discuss the rationale behind gathering such a corpus, the methodology used to select, process, and align the data, and the corpus' potential utility for historians and digital humanities researchers{---}particularly within the realms of neural machine translation-based post-OCR correction approaches, and other natural language processing tasks that are critically affected by erroneous OCR.},
}
Markdown (Informal)
[BLN600: A Parallel Corpus of Machine/Human Transcribed Nineteenth Century Newspaper Texts](https://aclanthology.org/2024.lrec-main.219/) (Booth et al., LREC-COLING 2024)
ACL