% Workshop paper W19-2513 (Soni, Klein, Eisenstein; 2019).
% The url field uses the canonical aclanthology.org address rather than a
% branch-specific preview.aclanthology.org build, which is not a stable link.
@inproceedings{soni-etal-2019-correcting,
  title     = {Correcting Whitespace Errors in Digitized Historical Texts},
  author    = {Soni, Sandeep and
               Klein, Lauren and
               Eisenstein, Jacob},
  editor    = {Alex, Beatrice and
               Degaetano-Ortlieb, Stefania and
               Kazantseva, Anna and
               Reiter, Nils and
               Szpakowicz, Stan},
  booktitle = {Proceedings of the 3rd Joint {SIGHUM} Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature},
  month     = jun,
  year      = {2019},
  address   = {Minneapolis, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/W19-2513/},
  doi       = {10.18653/v1/W19-2513},
  pages     = {98--103},
  abstract  = {Whitespace errors are common to digitized archives. This paper describes a lightweight unsupervised technique for recovering the original whitespace. Our approach is based on count statistics from Google n-grams, which are converted into a likelihood ratio test computed from interpolated trigram and bigram probabilities. To evaluate this approach, we annotate a small corpus of whitespace errors in a digitized corpus of newspapers from the 19th century United States. Our technique identifies and corrects most whitespace errors while introducing a minimal amount of oversegmentation: it achieves 77{\%} recall at a false positive rate of less than 1{\%}, and 91{\%} recall at a false positive rate of less than 3{\%}.},
}
Markdown (Informal)
[Correcting Whitespace Errors in Digitized Historical Texts](https://aclanthology.org/W19-2513/) (Soni et al., LaTeCH 2019)
ACL
- Sandeep Soni, Lauren Klein, and Jacob Eisenstein. 2019. Correcting Whitespace Errors in Digitized Historical Texts. In Proceedings of the 3rd Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature, pages 98–103, Minneapolis, USA. Association for Computational Linguistics.