@inproceedings{ogueji-etal-2021-small,
title = "Small Data? No Problem! Exploring the Viability of Pretrained Multilingual Language Models for Low-resourced Languages",
author = "Ogueji, Kelechi and
Zhu, Yuxin and
Lin, Jimmy",
editor = "Ataman, Duygu and
Birch, Alexandra and
Conneau, Alexis and
Firat, Orhan and
Ruder, Sebastian and
Sahin, Gozde Gul",
booktitle = "Proceedings of the 1st Workshop on Multilingual Representation Learning",
month = nov,
year = "2021",
address = "Punta Cana, Dominican Republic",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2021.mrl-1.11/",
doi = "10.18653/v1/2021.mrl-1.11",
pages = "116--126",
abstract = "Pretrained multilingual language models have been shown to work well on many languages for a variety of downstream NLP tasks. However, these models are known to require a lot of training data. This consequently leaves out a huge percentage of the world{'}s languages as they are under-resourced. Furthermore, a major motivation behind these models is that lower-resource languages benefit from joint training with higher-resource languages. In this work, we challenge this assumption and present the first attempt at training a multilingual language model on only low-resource languages. We show that it is possible to train competitive multilingual language models on less than 1 GB of text. Our model, named AfriBERTa, covers 11 African languages, including the first language model for 4 of these languages. Evaluations on named entity recognition and text classification spanning 10 languages show that our model outperforms mBERT and XLM-Rin several languages and is very competitive overall. Results suggest that our ``small data'' approach based on similar languages may sometimes work better than joint training on large datasets with high-resource languages. Code, data and models are released at \url{https://github.com/keleog/afriberta}."
}