% Conference paper from the LREC 2026 proceedings (ACL Anthology id 2026.lrec-main.600).
% Typed as inproceedings: the venue name lives in booktitle, and the Anthology
% volume label "main" is kept in an ignored field because it is not a numeric
% volume and would otherwise be printed as "volume main" by standard styles.
% NOTE(review): url points at a preview/ingest host -- confirm the final URL before release.
% NOTE(review): given name "Phillippe" is kept exactly as exported -- verify spelling against the paper.
@inproceedings{tremblay-taillon-langlais-2026-escibench,
  title         = {{eSciBench}: An Extensible Scientific {PDF} Extraction Benchmark},
  author        = {Tremblay Taillon, Noah and
                   Langlais, Phillippe},
  editor        = {Piperidis, Stelios and
                   Bel, N{\'u}ria and
                   van den Heuvel, Henk and
                   Ide, Nancy and
                   Krek, Simon and
                   Toral, Antonio},
  booktitle     = {International Conference on Language Resources and Evaluation},
  month         = may,
  year          = {2026},
  address       = {Palma de Mallorca, Spain},
  publisher     = {ELRA Language Resource Association},
  pages         = {7568--7580},
  url           = {https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.600/},
  abstract      = {Automatically extracting information from PDF documents (such as authors, affiliations, references, tables, equations) may be transformative in Digital Humanities where meta-data accompanying a document is typically manually collected, a cumbersome process. In this work, we conduct a systematic benchmarking of PDF extractors on a set of 100 scientific articles (1949 pages) of the STEM domain that have been processed automatically, then carefully curated. Our benchmark, named eSciBench is openly accessible. Putting to the test 13 extractors on it reveals that although some extractors perform well overall, extracting information from scientific articles is far from a solved problem.},
  internal-note = {ACL Anthology volume label: main},
}

Markdown (Informal)
[eSciBench: An Extensible Scientific PDF Extraction Benchmark](https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.600/) (Tremblay Taillon & Langlais, LREC 2026)
ACL