@comment{Booth, Shoemaker and Gaizauskas, LREC 2022 (ACL Anthology ID 2022.lrec-1.630).
The url field holds the canonical aclanthology.org address; links under
preview.aclanthology.org appear to be per-branch staging builds and should not be cited.}
@inproceedings{booth-etal-2022-language,
  title     = {A Language Modelling Approach to Quality Assessment of {OCR}{'}ed Historical Text},
  author    = {Booth, Callum and
               Shoemaker, Robert and
               Gaizauskas, Robert},
  editor    = {Calzolari, Nicoletta and
               B{\'e}chet, Fr{\'e}d{\'e}ric and
               Blache, Philippe and
               Choukri, Khalid and
               Cieri, Christopher and
               Declerck, Thierry and
               Goggi, Sara and
               Isahara, Hitoshi and
               Maegaard, Bente and
               Mariani, Joseph and
               Mazo, H{\'e}l{\`e}ne and
               Odijk, Jan and
               Piperidis, Stelios},
  booktitle = {Proceedings of the Thirteenth Language Resources and Evaluation Conference},
  month     = jun,
  year      = {2022},
  address   = {Marseille, France},
  publisher = {European Language Resources Association},
  url       = {https://aclanthology.org/2022.lrec-1.630/},
  pages     = {5859--5864},
  abstract  = {We hypothesise and evaluate a language model-based approach for scoring the quality of OCR transcriptions in the British Library Newspapers (BLN) corpus parts 1 and 2, to identify the best quality OCR for use in further natural language processing tasks, with a wider view to link individual newspaper reports of crime in nineteenth-century London to the Digital Panopticon{---}a structured repository of criminal lives. We mitigate the absence of gold standard transcriptions of the BLN corpus by utilising a corpus of genre-adjacent texts that capture the common and legal parlance of nineteenth-century London{---}the Proceedings of the Old Bailey Online{---}with a view to rank the BLN transcriptions by their OCR quality.},
}
Markdown (Informal)
[A Language Modelling Approach to Quality Assessment of OCR’ed Historical Text](https://aclanthology.org/2022.lrec-1.630/) (Booth et al., LREC 2022)
ACL