@inproceedings{zevallos-etal-2023-frequency,
title = "Frequency Balanced Datasets Lead to Better Language Models",
author = "Zevallos, Rodolfo and
Farr{\'u}s, Mireia and
Bel, N{\'u}ria",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.findings-emnlp.527/",
doi = "10.18653/v1/2023.findings-emnlp.527",
pages = "7859--7872",
    abstract = "This paper reports on experiments aimed at improving our understanding of the amount of data required for training attention-based transformer language models. Specifically, we investigate the impact of reducing the immense amounts of required pre-training data through sampling strategies that identify and reduce high-frequency tokens, as different studies have indicated that the presence of very high-frequency tokens in pre-training data might bias learning and cause undesired effects. In this light, we describe our sampling algorithm, which iteratively assesses token frequencies and removes sentences that still contain high-frequency tokens, eventually delivering a balanced, linguistically correct dataset. We evaluate the results in terms of model perplexity and of fine-tuning on linguistic probing tasks, downstream NLP tasks, and the more semantic SuperGLUE tasks. The results show that pre-training with the resulting balanced dataset allows reducing the pre-training data by up to three times."
}
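
The abstract describes an iterative sampling algorithm that repeatedly measures token frequencies and drops sentences still containing over-frequent tokens. The following is a minimal sketch of that idea, not the authors' published algorithm: the whitespace tokenization, the relative-frequency cap `max_token_freq`, and the iteration budget `max_iters` are all illustrative assumptions.

```python
from collections import Counter

def frequency_balance(sentences, max_token_freq=1e-3, max_iters=10):
    """Iteratively drop sentences that contain over-frequent tokens.

    Hypothetical sketch of the frequency-balancing idea from the
    abstract; thresholds, tokenization, and stopping rule are assumed.
    """
    corpus = list(sentences)
    for _ in range(max_iters):
        if not corpus:
            break
        # Re-assess token frequencies over the current corpus.
        counts = Counter(tok for s in corpus for tok in s.split())
        total = sum(counts.values())
        # Tokens whose relative frequency still exceeds the cap.
        heavy = {t for t, c in counts.items() if c / total > max_token_freq}
        if not heavy:
            break  # balanced: no token is over-represented
        # Remove sentences that still contain a high-frequency token.
        corpus = [s for s in corpus if not heavy & set(s.split())]
    return corpus
```

In practice a cap this aggressive would discard most sentences containing common function words, so any real implementation would need a gentler removal policy; the sketch only illustrates the iterate-assess-remove loop.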