% ACL Anthology record 2024.emnlp-main.862 (EMNLP 2024). The url field uses the
% stable aclanthology.org address rather than a per-branch preview build.
@inproceedings{guan-etal-2024-effective,
  title     = {Effective Synthetic Data and Test-Time Adaptation for {OCR} Correction},
  author    = {Guan, Shuhao and
               Xu, Cheng and
               Lin, Moule and
               Greene, Derek},
  editor    = {Al-Onaizan, Yaser and
               Bansal, Mohit and
               Chen, Yun-Nung},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.emnlp-main.862/},
  doi       = {10.18653/v1/2024.emnlp-main.862},
  pages     = {15412--15425},
  abstract  = {Post-OCR technology is used to correct errors in the text produced by OCR systems. This study introduces a method for constructing post-OCR synthetic data with different noise levels using weak supervision. We define Character Error Rate (CER) thresholds for {\textquotedblleft}effective{\textquotedblright} and {\textquotedblleft}ineffective{\textquotedblright} synthetic data, allowing us to create more useful multi-noise level synthetic datasets. Furthermore, we propose Self-Correct-Noise Test-Time Adaptation (SCN-TTA), which combines self-correction and noise generation mechanisms. SCN-TTA allows a model to dynamically adjust to test data without relying on labels, effectively handling proper nouns in long texts and further reducing CER. In our experiments we evaluate a range of models, including multiple PLMs and LLMs. Results indicate that our method yields models that are effective across diverse text types. Notably, the ByT5 model achieves a CER reduction of 68.67{\%} without relying on manually annotated data.},
}
Markdown (Informal)
[Effective Synthetic Data and Test-Time Adaptation for OCR Correction](https://aclanthology.org/2024.emnlp-main.862/) (Guan et al., EMNLP 2024)
ACL