@inproceedings{gao-etal-2024-kd,
title = "{VE}-{KD}: Vocabulary-Expansion Knowledge-Distillation for Training Smaller Domain-Specific Language Models",
author = "Gao, Pengju and
Yamasaki, Tomohiro and
Imoto, Kazunori",
editor = "Al-Onaizan, Yaser and
Bansal, Mohit and
Chen, Yun-Nung",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2024",
month = nov,
year = "2024",
address = "Miami, Florida, USA",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/add-emnlp-2024-awards/2024.findings-emnlp.884/",
doi = "10.18653/v1/2024.findings-emnlp.884",
pages = "15046--15059",
abstract = "We propose VE-KD, a novel method that balances knowledge distillation and vocabulary expansion with the aim of training efficient domain-specific language models. Compared with traditional pre-training approaches, VE-KD exhibits competitive performance in downstream tasks while reducing model size and using fewer computational resources. Additionally, VE-KD refrains from overfitting in domain adaptation. Our experiments with different biomedical domain tasks demonstrate that VE-KD performs well compared with models such as BioBERT (+1{\%} at HoC) and PubMedBERT (+1{\%} at PubMedQA), with about 96{\%} less training time. Furthermore, it outperforms DistilBERT and Adapt-and-Distill, showing a significant improvement in document-level tasks. Investigation of vocabulary size and tolerance, which are hyperparameters of our method, provides insights for further model optimization. The fact that VE-KD consistently maintains its advantages, even when the corpus size is small, suggests that it is a practical approach for domain-specific language tasks and is transferrable to different domains for broader applications."
}