% LREC 2020 conference paper; ACL Anthology ID 2020.lrec-1.570.
@inproceedings{lawrie-etal-2020-building,
  title     = {Building {OCR}/{NER} Test Collections},
  author    = {Lawrie, Dawn and
               Mayfield, James and
               Etter, David},
  editor    = {Calzolari, Nicoletta and
               B{\'e}chet, Fr{\'e}d{\'e}ric and
               Blache, Philippe and
               Choukri, Khalid and
               Cieri, Christopher and
               Declerck, Thierry and
               Goggi, Sara and
               Isahara, Hitoshi and
               Maegaard, Bente and
               Mariani, Joseph and
               Mazo, H{\'e}l{\`e}ne and
               Moreno, Asuncion and
               Odijk, Jan and
               Piperidis, Stelios},
  booktitle = {Proceedings of the Twelfth Language Resources and Evaluation Conference},
  month     = may,
  year      = {2020},
  address   = {Marseille, France},
  publisher = {European Language Resources Association},
  url       = {https://aclanthology.org/2020.lrec-1.570/},
  pages     = {4639--4646},
  language  = {eng},
  isbn      = {979-10-95546-34-4},
  abstract  = {Named entity recognition (NER) identifies spans of text that contain names. Many researchers have reported the results of NER on text created through optical character recognition (OCR) over the past two decades. Unfortunately, the test collections that support this research are annotated with named entities after optical character recognition (OCR) has been run. This means that the collection must be re-annotated if the OCR output changes. Instead by tying annotations to character locations on the page, a collection can be built that supports OCR and NER research without requiring re-annotation when either improves. This means that named entities are annotated on the transcribed text. The transcribed text is all that is needed to evaluate the performance of OCR. For NER evaluation, the tagged OCR output is aligned to the transcriptions the aligned files, creating modified files of each, which are scored. This paper presents a methodology for building such a test collection and releases a collection of Chinese OCR-NER data constructed using the methodology. The paper provides performance baselines for current OCR and NER systems applied to this new collection.},
}
Markdown (Informal)
[Building OCR/NER Test Collections](https://aclanthology.org/2020.lrec-1.570/) (Lawrie et al., LREC 2020)
ACL
- Dawn Lawrie, James Mayfield, and David Etter. 2020. Building OCR/NER Test Collections. In Proceedings of the Twelfth Language Resources and Evaluation Conference, pages 4639–4646, Marseille, France. European Language Resources Association.