% Conference paper entry; booktitle names the ACL 2026 annual-meeting proceedings.
% NOTE(review): the url field points at the preview.aclanthology.org ingest
% host -- confirm and replace with the permanent URL once it is known.
@inproceedings{voinea-2026-validator,
  title     = {Validator-Guided Hard Negative Mining for Masked Language Modeling in Low-Resource Ancient Languages},
  author    = {Voinea, Andrei},
  editor    = {T.Y.S.S., Santosh and
               Rodriguez, Juan Diego and
               de Gibert, Ona},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} ({ACL} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-srw.69/},
  pages     = {779--790},
  isbn      = {979-8-89176-393-7},
  abstract  = {Masked language modeling for low-resource ancient languages remains challenging because pre-trained multilingual models lack exposure to these languages. We investigate rule-based linguistic constraints and hard negative mining for Sumerian, a language isolate not included in multilingual BERT{'}s training data. We build a hierarchical validator capturing subword, word, and part-of-speech patterns from 4,545 annotated sequences, using it to filter candidates and identify hard negatives for fine-tuning. Vanilla mBERT achieves 18.0{\%} hit@10 accuracy. The validator alone improves this to 72.8{\%}, while hard negative fine-tuning reaches 78.3{\%}. Combining both yields 86.7{\%}, a 68.7 percentage point improvement. Temporal generalization evaluation on tablets from 600 years earlier shows that both the hard negative mining and the validator alone improve performance, but the combined approach underperforms due to the validator{'}s period specific rules. These findings demonstrate that hard negative mining transfers across periods while explicit rule-based constraints provide strong in-domain improvements but limited cross-period generalization.},
}

Markdown (Informal)
[Validator-Guided Hard Negative Mining for Masked Language Modeling in Low-Resource Ancient Languages](https://preview.aclanthology.org/ingest-acl/2026.acl-srw.69/) (Voinea, ACL 2026)
ACL