@inproceedings{yano-etal-2025-step,
title = "{STEP}: Staged Parameter-Efficient Pre-training for Large Language Models",
author = "Yano, Kazuki and
Ito, Takumi and
Suzuki, Jun",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2025.naacl-short.32/",
pages = "374--384",
ISBN = "979-8-89176-190-2",
abstract = "Pre-training large language models (LLMs) faces significant memory challenges due to the large size of model weights. We introduce STaged parameter-Efficient Pre-training (STEP), which integrates parameter-efficient tuning techniques with model growth. We conduct experiments on pre-training LLMs of various sizes and demonstrate that STEP achieves up to a 53.9{\%} reduction in maximum memory requirements compared to vanilla pre-training while maintaining equivalent performance. Furthermore, we show that the model pre-trained with STEP performs comparably to vanilla pre-trained models on downstream tasks after instruction tuning."
}
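As a rough illustration of the staged idea the abstract describes (grow the model across stages and update only the newly added parameters plus low-rank adapters on the frozen ones), here is a minimal, hypothetical PyTorch sketch. The layer sizes, adapter rank, growth step, and training objective are illustrative assumptions, not the paper's actual method or configuration.

```python
# Minimal sketch of staged parameter-efficient pre-training (illustrative only;
# sizes, rank, and objective are assumptions, not the paper's setup).
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """Frozen linear layer plus a trainable low-rank update: W x + B A x."""
    def __init__(self, base: nn.Linear, rank: int = 4):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False                       # freeze stage-1 weights
        self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, rank))

    def forward(self, x):
        return self.base(x) + x @ self.A.T @ self.B.T

def grow(model: nn.Sequential, new_blocks: int, width: int) -> nn.Sequential:
    """Stage transition: wrap existing linears with adapters, append new blocks."""
    grown = [LoRALinear(m) if isinstance(m, nn.Linear) else m for m in model]
    for _ in range(new_blocks):
        grown += [nn.Linear(width, width), nn.ReLU()]     # new, fully trainable
    return nn.Sequential(*grown)

def train(model: nn.Module, steps: int = 100, width: int = 64):
    # Only parameters with requires_grad=True (new blocks + adapters) are updated.
    opt = torch.optim.Adam((p for p in model.parameters() if p.requires_grad), lr=1e-3)
    for _ in range(steps):
        x = torch.randn(8, width)                         # dummy batch
        loss = model(x).pow(2).mean()                     # dummy objective
        opt.zero_grad()
        loss.backward()
        opt.step()

width = 64
model = nn.Sequential(nn.Linear(width, width), nn.ReLU())  # stage 1: small model
train(model)
model = grow(model, new_blocks=2, width=width)             # stage 2: grown model
train(model)                                               # only new/adapter params update
```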