@comment{ACL Anthology record 2024.lt4hala-1.14 (LT4HALA workshop paper).
  The url field uses the canonical aclanthology.org address; the exported
  preview-branch mirror (preview.aclanthology.org/fix-sig-urls) is a
  temporary staging host and should not be cited.}
@inproceedings{thomas-etal-2024-leveraging,
    title     = "Leveraging {LLMs} for Post-{OCR} Correction of Historical Newspapers",
    author    = "Thomas, Alan and
                 Gaizauskas, Robert and
                 Lu, Haiping",
    editor    = "Sprugnoli, Rachele and
                 Passarotti, Marco",
    booktitle = "Proceedings of the Third Workshop on Language Technologies for Historical and Ancient Languages ({LT4HALA}) @ {LREC-COLING-2024}",
    month     = may,
    year      = "2024",
    address   = "Torino, Italia",
    publisher = "ELRA and ICCL",
    url       = "https://aclanthology.org/2024.lt4hala-1.14/",
    pages     = "116--121",
    abstract  = "Poor OCR quality continues to be a major obstacle for humanities scholars seeking to make use of digitised primary sources such as historical newspapers. Typical approaches to post-OCR correction employ sequence-to-sequence models for a neural machine translation task, mapping erroneous OCR texts to accurate reference texts. We shift our focus towards the adaptation of generative LLMs for a prompt-based approach. By instruction-tuning Llama 2 and comparing it to a fine-tuned BART on BLN600, a parallel corpus of 19th century British newspaper articles, we demonstrate the potential of a prompt-based approach in detecting and correcting OCR errors, even with limited training data. We achieve a significant enhancement in OCR quality with Llama 2 outperforming BART, achieving a 54.51{\%} reduction in the character error rate against BART{'}s 23.30{\%}. This paves the way for future work leveraging generative LLMs to improve the accessibility and unlock the full potential of historical texts for humanities research."
}
Markdown (Informal)
[Leveraging LLMs for Post-OCR Correction of Historical Newspapers](https://aclanthology.org/2024.lt4hala-1.14/) (Thomas et al., LT4HALA 2024)
ACL