@inproceedings{micheli-etal-2020-importance,
title = "On the importance of pre-training data volume for compact language models",
author = "Micheli, Vincent and
d{'}Hoffschmidt, Martin and
Fleuret, Fran{\c{c}}ois",
editor = "Webber, Bonnie and
Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/add-emnlp-2024-awards/2020.emnlp-main.632/",
doi = "10.18653/v1/2020.emnlp-main.632",
pages = "7853--7858",
abstract = "Recent advances in language modeling have led to computationally intensive and resource-demanding state-of-the-art models. In an effort towards sustainable practices, we study the impact of pre-training data volume on compact language models. Multiple BERT-based models are trained on gradually increasing amounts of French text. Through fine-tuning on the French Question Answering Dataset (FQuAD), we observe that well-performing models are obtained with as little as 100 MB of text. In addition, we show that past critically low amounts of pre-training data, an intermediate pre-training step on the task-specific corpus does not yield substantial improvements."
}
Markdown (Informal)
[On the importance of pre-training data volume for compact language models](https://aclanthology.org/2020.emnlp-main.632/) (Micheli et al., EMNLP 2020)
ACL
Vincent Micheli, Martin d'Hoffschmidt, and François Fleuret. 2020. On the importance of pre-training data volume for compact language models. In Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP), pages 7853–7858, Online. Association for Computational Linguistics.