@inproceedings{pfeiffer-etal-2021-unks,
title = "{UNK}s Everywhere: {A}dapting Multilingual Language Models to New Scripts",
author = "Pfeiffer, Jonas and
Vuli{\'c}, Ivan and
Gurevych, Iryna and
Ruder, Sebastian",
editor = "Moens, Marie-Francine and
Huang, Xuanjing and
Specia, Lucia and
Yih, Scott Wen-tau",
booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2021",
address = "Online and Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.emnlp-main.800/",
doi = "10.18653/v1/2021.emnlp-main.800",
pages = "10186--10203",
    abstract = "Massively multilingual language models such as multilingual BERT offer state-of-the-art cross-lingual transfer performance on a range of NLP tasks. However, due to limited capacity and large differences in pretraining data sizes, there is a profound performance gap between resource-rich and resource-poor target languages. The ultimate challenge is dealing with under-resourced languages not covered at all by the models and written in scripts unseen during pretraining. In this work, we propose a series of novel data-efficient methods that enable quick and effective adaptation of pretrained multilingual models to such low-resource languages and unseen scripts. Relying on matrix factorization, our methods capitalize on the existing latent knowledge about multiple languages already available in the pretrained model's embedding matrix. Furthermore, we show that learning of the new dedicated embedding matrix in the target language can be improved by leveraging a small number of vocabulary items (i.e., the so-called lexically overlapping tokens) shared between mBERT's and target language vocabulary. Our adaptation techniques offer substantial performance gains for languages with unseen scripts. We also demonstrate that they can yield improvements for low-resource languages written in scripts covered by the pretrained model."
}
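
The abstract mentions seeding a new target-language embedding matrix by reusing embeddings of lexically overlapping tokens shared with the pretrained model's vocabulary. Below is a minimal sketch of that overlap-based initialization idea only; it does not implement the paper's matrix-factorization method, and all vocabularies, dimensions, and names are hypothetical, chosen purely for illustration.

import numpy as np

rng = np.random.default_rng(0)

# Hypothetical source (mBERT-like) vocabulary and its embedding matrix.
source_vocab = {"[UNK]": 0, "##ing": 1, "the": 2, "Berlin": 3, "##s": 4}
embed_dim = 8
source_embeddings = rng.normal(size=(len(source_vocab), embed_dim))

# Hypothetical target-language vocabulary; a few tokens lexically overlap
# with the source vocabulary, the rest are new (e.g. from an unseen script).
target_vocab = {"[UNK]": 0, "##ing": 1, "##s": 2, "selam": 3, "tena": 4}

# Initialize the new embedding matrix: sample non-overlapping rows from the
# source matrix's empirical distribution, then copy rows for overlapping tokens.
mean = source_embeddings.mean(axis=0)
std = source_embeddings.std(axis=0)
target_embeddings = rng.normal(loc=mean, scale=std,
                               size=(len(target_vocab), embed_dim))
for token, tgt_id in target_vocab.items():
    if token in source_vocab:
        target_embeddings[tgt_id] = source_embeddings[source_vocab[token]]

print("copied rows for overlapping tokens:",
      [t for t in target_vocab if t in source_vocab])

Per the abstract, such a warm start over shared vocabulary items is one ingredient that improves learning of the dedicated target-language embedding matrix; the paper's full approach additionally relies on matrix factorization of the pretrained embedding matrix.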