@inproceedings{najem-meyer-etal-2025-dont,
title = "Don{'}t stop pretraining! Efficiently building specialised language models in resource-constrained settings.",
author = "Najem-Meyer, Sven and
Kaplan, Fr{\'e}d{\'e}ric and
Romanello, Matteo",
editor = "Kazantseva, Anna and
Szpakowicz, Stan and
Degaetano-Ortlieb, Stefania and
Bizzoni, Yuri and
Pagel, Janis",
booktitle = "Proceedings of the 9th Joint SIGHUM Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature (LaTeCH-CLfL 2025)",
month = may,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2025.latechclfl-1.22/",
pages = "252--260",
ISBN = "979-8-89176-241-1",
abstract = "Developing specialised language models for low-resource domains typically involves a trade-off between two specialisation strategies: adapting a general-purpose model through continued pretraining or retraining a model from scratch. While adapting preserves the model{'}s linguistic knowledge, retraining benefits from the flexibility of an in-domain tokeniser {--} a potentially significant advantage when handling rare languages. This study investigates the impact of tokenisation, specialisation strategy, and pretraining data availability using classical scholarship {--} a multilingual, code-switching and highly domain-specific field {--} as a case study. Through extensive experiments, we assess whether domain-specific tokenisation improves model performance, whether character-based models provide a viable alternative to subword-based models, and which specialisation strategy is optimal given the constraints of limited pretraining data. Contrary to prior findings, our results show that in-domain tokenisation does not necessarily enhance performance. Most notably, adaptation consistently outperforms retraining, even with limited data, confirming its efficiency as the preferred strategy for resource-constrained domains. These insights provide valuable guidelines for developing specialised models in fields with limited textual resources."
}