@inproceedings{bian-etal-2026-efficiently-explore,
title = "How to Efficiently Explore Noisy Historical Data? Leveraging Corpus Pre-Targeting to Enhance Graph-based {RAG}",
author = "Bian, Donghan and
Puren, Marie and
Cafiero, Florian",
editor = "Alves, Diego and
Bizzoni, Yuri and
Degaetano-Ortlieb, Stefania and
Kazantseva, Anna and
Pagel, Janis and
Szpakowicz, Stan",
booktitle = "Proceedings of the 10th Joint {SIGHUM} Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature 2026",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-eacl/2026.latechclfl-1.23/",
pages = "241--250",
ISBN = "979-8-89176-373-9",
abstract = "Graph-based Retrieval-Augmented Generation (RAG) is increasingly used to explore long, heterogeneous, and weakly structured corpora, including historical archives. However, in such settings, naive full-corpus indexing is often computationally costly and sensitive to OCR noise, document redundancy, and topical dispersion. In this paper, we investigate corpus pre-targeting strategies as an intermediate layer to improve the efficiency and effectiveness of graph-based RAG for historical research.We evaluate a set of pre-targeting heuristics tailored to single-hop and multi-hop of historical questions on HistoriQA-ThirdRepublic, a French question-answering dataset derived from parliamentary debates and contemporary newspapers. Our results show that appropriate pre-targeting strategies can improve retrieval recall by 3{--}5{\%} while reducing token consumption by 32{--}37{\%} compared to full-corpus indexing, without degrading coverage of relevant documents.Beyond performance gains, this work highlights the importance of corpus-level optimization for applying RAG to large-scale historical collections, and provides practical insights for adapting graph-based RAG pipelines to the specific constraints of digitized archives."
}Markdown (Informal)
[How to Efficiently Explore Noisy Historical Data? Leveraging Corpus Pre-Targeting to Enhance Graph-based RAG](https://preview.aclanthology.org/ingest-eacl/2026.latechclfl-1.23/) (Bian et al., LaTeCH-CLfL 2026)
ACL