@inproceedings{manjavacas-arevalo-fonteyn-2021-macberth,
title = "{M}ac{BERT}h: Development and Evaluation of a Historically Pre-trained Language Model for {E}nglish (1450-1950)",
author = "Manjavacas Arevalo, Enrique and
Fonteyn, Lauren",
editor = {H{\"a}m{\"a}l{\"a}inen, Mika and
Alnajjar, Khalid and
Partanen, Niko and
Rueter, Jack},
booktitle = "Proceedings of the Workshop on Natural Language Processing for Digital Humanities",
month = dec,
year = "2021",
address = "NIT Silchar, India",
publisher = "NLP Association of India (NLPAI)",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2021.nlp4dh-1.4/",
pages = "23--36",
abstract = "The new pre-train-then-fine-tune paradigm in Natural made important performance gains accessible to a wider audience. Once pre-trained, deploying a large language model presents comparatively small infrastructure requirements, and offers robust performance in many NLP tasks. The Digital Humanities community has been an early adapter of this paradigm. Yet, a large part of this community is concerned with the application of NLP algorithms to historical texts, for which large models pre-trained on contemporary text may not provide optimal results. In the present paper, we present {\textquotedblleft}MacBERTh{\textquotedblright}{---}a transformer-based language model pre-trained on historical English{---}and exhaustively assess its benefits on a large set of relevant downstream tasks. Our experiments highlight that, despite some differences across target time periods, pre-training on historical language from scratch outperforms models pre-trained on present-day language and later adapted to historical language."
}
[MacBERTh: Development and Evaluation of a Historically Pre-trained Language Model for English (1450-1950)](https://aclanthology.org/2021.nlp4dh-1.4/) (Manjavacas Arevalo & Fonteyn, NLP4DH 2021)