% Conference paper (LREC 2026): typed as inproceedings with the venue in booktitle.
% NOTE(review): volume is the non-numeric value "main" (appears to be the Anthology
% volume id); confirm the target bibliography style tolerates it.
@inproceedings{yilandiloglu-2026-llms,
  title     = {{LLM}s in {Ottoman} {Turkish}: From {MLM} to {NER}},
  author    = {Y{\i}landilo{\u{g}}lu, Enes},
  editor    = {Piperidis, Stelios and
               Bel, N{\'u}ria and
               van den Heuvel, Henk and
               Ide, Nancy and
               Krek, Simon and
               Toral, Antonio},
  booktitle = {International Conference on Language Resources and Evaluation},
  volume    = {main},
  month     = may,
  year      = {2026},
  address   = {Palma de Mallorca, Spain},
  publisher = {ELRA Language Resource Association},
  url       = {https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.281/},
  pages     = {3517--3522},
  abstract  = {This paper introduces three foundational contributions to Digital Ottoman Turkish Studies. It presents: (1) three masked language models (MLMs) trained on over 11 million words from 144 works spanning from the 15th to 20th century, (2) a state-of-the-art Named Entity Recognition (NER) model (F1 = 89.94{\%}) trained on 9,960 manually annotated entities, and (3) a state-of-the-art Universal Dependency (UD) parsing model for Ottoman Turkish. This work differs from others by deploying IJMES-transliterated documents for training and evaluation in order to prevent loss of information due to the change of the script from Perso-Arabic to Latin. The paper further explores probabilistic manuscript reconstruction in preliminary experiments, showing that MLMs can recover unread sections in historical documents with 77.8{\%} top-1 accuracy when a list of candidate words is provided. Followed by a discussion, the paper outlines the future directions as building century-aware MLMs and expanding the training data across genres to enhance model generalization.},
}
Markdown (Informal)
[LLMs in Ottoman Turkish: From MLM to NER](https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.281/) (Yılandiloğlu, LREC 2026)
ACL