% ACL Anthology record 2022.wiesp-1.14 (WIESP 2022 workshop paper).
% The url field uses the canonical aclanthology.org address; the anthology ID
% in it matches the suffix of the doi field.
@inproceedings{ahuja-etal-2022-parsing,
  title     = {Parsing Electronic Theses and Dissertations Using Object Detection},
  author    = {Ahuja, Aman and
               Devera, Alan and
               Fox, Edward Alan},
  editor    = {Ghosal, Tirthankar and
               Blanco-Cuaresma, Sergi and
               Accomazzi, Alberto and
               Patton, Robert M. and
               Grezes, Felix and
               Allen, Thomas},
  booktitle = {Proceedings of the first Workshop on Information Extraction from Scientific Publications},
  month     = nov,
  year      = {2022},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.wiesp-1.14/},
  doi       = {10.18653/v1/2022.wiesp-1.14},
  pages     = {121--130},
  abstract  = {Electronic theses and dissertations (ETDs) contain valuable knowledge that can be useful for a wide range of purposes. To effectively utilize the knowledge contained in ETDs for downstream tasks such as search and retrieval, question-answering, and summarization, the data first needs to be parsed and stored in a format such as XML. However, since most of the ETDs available on the web are PDF documents, parsing them to make their data useful for downstream tasks is a challenge. In this work, we propose a dataset and a framework to help with parsing long scholarly documents such as ETDs. We take the Object Detection approach for document parsing. We first introduce a set of objects that are important elements of an ETD, along with a new dataset ETD-OD that consists of over 25K page images originating from 200 ETDs with bounding boxes around each of the objects. We also propose a framework that utilizes this dataset for converting ETDs to XML, which can further be used for ETD-related downstream tasks. Our code and pre-trained models are available at: \url{https://github.com/Opening-ETDs/ETD-OD}.},
}
Markdown (Informal)
[Parsing Electronic Theses and Dissertations Using Object Detection](https://aclanthology.org/2022.wiesp-1.14/) (Ahuja et al., WIESP 2022)
ACL