% Conference paper by Jean Barre in the NLP4DH 2026 proceedings (ACL).
% NOTE(review): the url below points at a preview/ingest host; confirm the
% canonical Anthology URL (and add a doi, if one exists) before citing.
@inproceedings{barre-2026-search,
  author    = {Barr{\'e}, Jean},
  title     = {In Search of Lost Adventure Novels: Supervised Genre Retrieval and Corpus Refinement in {Gallica}},
  editor    = {Hamilton, Sil and
               {\"O}hman, Emily and
               Hicke, Rebecca M. M. and
               Bizzoni, Yuri and
               Bax, Axel and
               Matthews, Jacob A. and
               H{\"a}m{\"a}l{\"a}inen, Mika},
  booktitle = {Proceedings of the 6th International Conference on Natural Language Processing for the Digital Humanities},
  month     = jul,
  year      = {2026},
  address   = {San Diego, USA},
  publisher = {Association for Computational Linguistics},
  pages     = {255--263},
  isbn      = {979-8-89176-427-9},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.nlp4dh-1.24/},
  abstract  = {This paper addresses a practical problem in computational literary history: retrieving adventure novels from a large digitized collection of French fiction where genre metadata are sparse and unreliable. We begin with supervised genre modeling based on a historically situated seed list of 101 adventure novels drawn from literary scholarship. We compare several classifiers and representations, and validate them against 364 independently labeled adventure novels from the Chapitres corpus. The best-performing model, HistGradientBoosting on mean paragraph embeddings, achieves strong external recall (81{\%}) despite the small training set. We then apply this model to the 12,176-novel Fictions litt{\'e}raires de Gallica archive and refine the resulting candidate corpus through a graph-based post-processing step over a $k$-nearest-neighbor similarity graph. On the Chapitres benchmark, this graph correction produces negligible changes in retrieval performance, indicating that it is not a generally superior classifier. On Gallica, however, it yields a more cohesive and homogeneous candidate corpus and surfaces interpretable correction cases, including missed canonical adventure novels and excluded borderline texts. We therefore argue that graph-based correction is best understood not as a replacement for supervised classification, but as a heuristic for refining large, noisy archival corpora where exhaustive manual annotation is impossible.},
}
Markdown (Informal)
[In Search of Lost Adventure Novels: Supervised Genre Retrieval and Corpus Refinement in Gallica](https://preview.aclanthology.org/ingest-acl-workshops/2026.nlp4dh-1.24/) (Barré, NLP4DH 2026)
ACL