@inproceedings{xie-etal-2024-efficient,
title = "Efficient Continual Pre-training for Building Domain Specific Large Language Models",
author = "Xie, Yong and
Aggarwal, Karan and
Ahmad, Aitzaz",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/add-emnlp-2024-awards/2024.findings-acl.606/",
doi = "10.18653/v1/2024.findings-acl.606",
pages = "10184--10201",
abstract = "Large language models (LLMs) have demonstrated remarkable open-domain capabilities. LLMs tailored for a domain are typically trained entirely on domain corpus to excel at handling domain-specific tasks. In this work, we explore an alternative strategy of continual pre-training as a means to develop domain-specific LLMs over an existing open-domain LLM. We introduce \textit{FinPythia-6.9B}, developed through domain-adaptive continual pre-training on the financial domain.Continual pre-trained FinPythia showcases consistent improvements on financial tasks over the original foundational model. We further explore simple but effective data selection strategies for continual pre-training. Our data selection strategies outperform vanilla continual pre-training`s performance with just 10{\%} of corpus size and cost, without any degradation on open-domain standard tasks. Our work proposes an alternative solution to building domain-specific LLMs cost-effectively."
}