@inproceedings{zhang-etal-2024-incremental,
title = "Incremental pre-training from smaller language models",
author = "Zhang, Han and
Wang, Hui and
Xu, Ruifeng",
editor = "Wong, Kam-Fai and
Zhang, Min and
Xu, Ruifeng and
Li, Jing and
Wei, Zhongyu and
Gui, Lin and
Liang, Bin and
Zhao, Runcong",
booktitle = "Proceedings of the 10th SIGHAN Workshop on Chinese Language Processing (SIGHAN-10)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2024.sighan-1.5/",
pages = "36--44",
abstract = "Large language models have recently become a new learning paradigm and led to state-of-the-art performance across a range of tasks. As explosive open-source pre-trained models are available, it is worth investigating how to better utilize existing models. We propose a simple yet effective method, Incr-Pretrain, for incrementally pre-training language models from smaller well-trained source models. Different layer-wise transfer strategies were introduced for model augmentation including parameter copying, initial value padding, and model distillation. Experiments on multiple zero-shot learning tasks demonstrate satisfying inference performance upon transferring and promising training efficiency during continuing pre-training. Compared to training from scratch, Incr-Pretrain can save up to half the training time to get a similar testing loss."
}
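The abstract names three layer-wise transfer strategies. Below is a minimal sketch of how the first two, parameter copying and initial value padding, might be combined when growing a single weight matrix from a smaller source model into a larger target model. The function name, shapes, and use of NumPy are illustrative assumptions, not the authors' implementation.

    # Illustrative sketch (not the paper's code): copy a trained smaller
    # matrix into the top-left block of a larger, freshly initialized one;
    # the remaining entries keep their initial values ("initial value padding").
    import numpy as np

    def transfer_matrix(small_w: np.ndarray, large_init: np.ndarray) -> np.ndarray:
        out = large_init.copy()
        rows, cols = small_w.shape
        out[:rows, :cols] = small_w   # parameter copying from the source model
        return out                    # untouched entries act as padding

    # Toy usage: grow a 4x4 projection from a trained 2x2 one.
    rng = np.random.default_rng(0)
    small = rng.normal(size=(2, 2))                    # "well-trained" source weights
    large_init = rng.normal(scale=0.02, size=(4, 4))   # target model's initialization
    large = transfer_matrix(small, large_init)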