@inproceedings{guo-etal-2025-efficient-domain,
title = "Efficient Domain Continual pretraining by Mitigating the Stability Gap",
author = "Guo, Yiduo and
Fu, Jie and
Zhang, Huishuai and
Zhao, Dongyan",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.1578/",
pages = "32850--32870",
ISBN = "979-8-89176-251-0",
abstract = "Continual pretraining enables Large Language Models (LLMs) to adapt to specialized domains like medicine and law. However, we observe a consistent phenomenon across different model sizes and domains: a temporary performance drop at the start of the continual pretraining process, followed by a performance recovery phase. To gain a deeper understanding of this issue, we use the stability gap{---} a concept adapted from the visual domain{---}which explains this initial drop arises from instability in the model{'}s general abilities. We validate this hypothesis through a series of experiments. To address this initial instability and enhance LLM performance within a fixed compute budget, we propose a training strategy that mitigates instability by increasing the number of epochs, alongside two data sampling strategies targeting data domain relevance and corpus distribution. We conduct experiments on Llama-family models to validate the effectiveness of our strategies for continual pretraining and instruction tuning in medical and legal domains. Our strategies improve the average medical task performance of the OpenLlama-3B model from 36.2{\%} to 40.7{\%} using only 40{\%} of the original training budget, while also enhancing general task performance without causing forgetting. Furthermore, we aPPLy our strategies to continually pre-train and instruction-tune the Llama-3-8B model. The resulting model, Llama-3-Physician, achieves the best medical performance among open-source models on several benchmarks and rivals GPT-4 on specific tasks. We release our models at https://huggingface.co/YiDuo1999/Llama-3-Physician-8B-Instruct."
}