@comment{
  Conference paper (inproceedings) from the 2026 proceedings of the
  International Conference on Natural Language Processing for the Digital
  Humanities, as exported from the ACL Anthology.
  NOTE(review): the url field points at a preview.aclanthology.org ingest
  path; confirm the permanent Anthology URL before relying on it.
  NOTE(review): the abstract keeps the exported Markdown-style asterisks
  around dalet verbatim; convert them if the abstract is ever typeset.
}
@inproceedings{tal-etal-2026-modeling,
  title     = {Modeling the ``Dalet'' Clitic in Historical {Hebrew} Texts: A New Prefix-Segmented {BERT} Model and Stylistic Analysis},
  author    = {Tal, Rachel and
               Shmidman, Cheyn Shmuel and
               Shmidman, Avi},
  editor    = {Hamilton, Sil and
               {\"O}hman, Emily and
               Hicke, Rebecca M. M. and
               Bizzoni, Yuri and
               Bax, Axel and
               Matthews, Jacob A. and
               H{\"a}m{\"a}l{\"a}inen, Mika},
  booktitle = {Proceedings of the 6th International Conference on Natural Language Processing for the Digital Humanities},
  month     = jul,
  year      = {2026},
  address   = {San Diego, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.nlp4dh-1.12/},
  pages     = {121--131},
  isbn      = {979-8-89176-427-9},
  abstract  = {The Aramaic proclitic *dalet*, widely used in historical Hebrew texts, serves two distinct grammatical functions: as a subordinating conjunction and as a possessive preposition. Because these functions are orthographically identical and no annotated resources exist for this task, large-scale computational analysis of their usage has previously been infeasible. In this paper we introduce a new BERT model for historical Hebrew in which all prefixes are segmented and encoded as independent tokens. This representation allows the model to evaluate proclitics directly and provides a probe-based unsupervised method for determining the grammatical role of the *dalet* clitic using masked language modeling predictions. We evaluate the approach on a manually annotated dataset drawn from historical Hebrew literature spanning multiple regions and historical periods, achieving an average F1 score of over 0.89. Applying the method to a corpus of more than 300 million words of historical Hebrew texts, we conduct large-scale stylistic analyses of the choice between the Aramaic *dalet* and available Hebrew alternatives. The results reveal geographic and diachronic trends and identify distinct stylistic clusters within the corpus. The prefix-segmented model and annotated dataset are released for unrestricted use.},
}
Markdown (Informal)
[Modeling the "Dalet" Clitic in Historical Hebrew Texts: A New Prefix-Segmented BERT Model and Stylistic Analysis](https://preview.aclanthology.org/ingest-acl-workshops/2026.nlp4dh-1.12/) (Tal et al., NLP4DH 2026)
ACL