@comment{
  Workshop paper record; field values are as exported, with delimiters and
  brace protection normalised.
  NOTE(review): the url field below points at the preview.aclanthology.org
  ingest-eacl host, presumably a staging address. Confirm the canonical
  Anthology URL (and DOI, if one is assigned) before relying on it.
}
@inproceedings{hill-etal-2026-catalogues-data,
  author    = {Hill, Mark and
               Bulus, Ayse and
               Spence, Paul},
  title     = {Catalogues as Data: Interpretable {NLP} Pipelines for {Ottoman-Turkish} Bibliographies},
  editor    = {Alves, Diego and
               Bizzoni, Yuri and
               Degaetano-Ortlieb, Stefania and
               Kazantseva, Anna and
               Pagel, Janis and
               Szpakowicz, Stan},
  booktitle = {Proceedings of the 10th Joint {SIGHUM} Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature 2026},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  pages     = {128--134},
  isbn      = {979-8-89176-373-9},
  url       = {https://preview.aclanthology.org/ingest-eacl/2026.latechclfl-1.12/},
  abstract  = {Bibliographies are both humanities infrastructure and historic record. To computationally analyse them, however, requires implementing complex digitisation and standardisation decisions. This paper turns to Seyfettin {\"O}zege{'}s Eski Harflerle Bas{\i}lm{\i}{\c{s}} T{\"u}rk{\c{c}}e Eserler Katalo{\u{g}}u as an example, a scanned set of volumes marked by complex page layouts, degraded typography, irregular entry structures, and historically contingent inconsistencies. With this we present a pipeline that constructs a structured, machine-readable, and analysable dataset out of the 27,000 entries with computer vision, OCR, large and visual language models, sequence-based validation, and custom review tools. This process captures 97.8{\%} of records, with remaining cases capable of being addressed by targeted review. This process demonstrates that combining LLMs with interpretable, review-centric pipelines, offers an appropriate approach for historically complex bibliographic sources.},
}
Markdown (Informal)
[Catalogues as Data: Interpretable NLP Pipelines for Ottoman-Turkish Bibliographies](https://preview.aclanthology.org/ingest-eacl/2026.latechclfl-1.12/) (Hill et al., LaTeCH-CLfL 2026)
ACL