% Besrour & Faerber, LREC 2026: conference paper introducing the unarXive 2024
% corpus. Stored as an inproceedings record with the venue in booktitle,
% because the venue is a conference rather than a journal.
% No volume field is given: "main" in the URL identifier (2026.lrec-main.556)
% appears to be a track label, not a volume number.
% The address field follows the ACL Anthology convention of holding the
% conference location.
% NOTE(review): booktitle holds the conference name as exported; confirm the
%   full proceedings title once the volume is officially published.
% NOTE(review): url points at a preview/ingest host and may not be permanent;
%   swap in the canonical address (and a doi, if one is assigned) when known.
% NOTE(review): publisher is kept exactly as exported; verify the spelling
%   against the publisher's official name.
@inproceedings{besrour-farber-2026-unarxive,
  title     = {{unarXive} 2024: A Large-Scale Scientific Corpus for Citation-Aware Retrieval and Generation},
  author    = {Besrour, Ines and
               F{\"a}rber, Michael},
  editor    = {Piperidis, Stelios and
               Bel, N{\'u}ria and
               van den Heuvel, Henk and
               Ide, Nancy and
               Krek, Simon and
               Toral, Antonio},
  booktitle = {International Conference on Language Resources and Evaluation},
  month     = may,
  year      = {2026},
  address   = {Palma de Mallorca, Spain},
  publisher = {ELRA Language Resource Association},
  url       = {https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.556/},
  pages     = {6990--6997},
  abstract  = {Full-text collections of scientific papers are essential for NLP research and the training of language models. However, existing resources remain incomplete: they often lag behind the fast-paced growth of scientific publishing, lack comprehensive citation networks, and discard essential structural elements. In this work, we introduce unarXive 2024, a large-scale, richly structured corpus containing every arXiv submission from January 1991 to December 2024 {--} over 2.28 million documents across physics, mathematics, computer science, and other fields. Our release enhances each paper with detailed metadata, reconstructs a substantially more complete citation network than existing datasets, and preserves fine-grained structural information, including section boundaries, mathematical notation, and non-textual elements. Beyond the corpus itself, we provide dense and sparse indexes optimized for retrieval-augmented generation (RAG) over the full arXiv archive. All resources, including code and data, are publicly available: https://github.com/faerber-lab/unarXive-2024},
}
Markdown (Informal)
[unarXive 2024: A Large-Scale Scientific Corpus for Citation-Aware Retrieval and Generation](https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.556/) (Besrour & Färber, LREC 2026)
ACL