@inproceedings{kim-wu-2024-knowlabs,
title = "Knowlab{'}s Submission to {L}+{M} Shared Task: All you need is continued pretraining of chemistry texts even for molecule captioning",
author = "Kim, Yunsoo and
Wu, Honghan",
editor = "Edwards, Carl and
Wang, Qingyun and
Li, Manling and
Zhao, Lawrence and
Hope, Tom and
Ji, Heng",
booktitle = "Proceedings of the 1st Workshop on Language + Molecules (L+M 2024)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2024.langmol-1.11/",
doi = "10.18653/v1/2024.langmol-1.11",
pages = "91--96",
abstract = "This paper presents our submission to the L+M-24 shared task, focused on translating molecular structures into natural language descriptions, known as the molecule captioning task. We selected a small language model (SLM), Phi-3-mini-4k, to evaluate the impact of continued pretraining and instruction tuning for domain-specific chemical knowledge. The Phi-3 model was continued pretrained with 90M chemistry textbooks and abstracts, followed by instruction tuning on 150K question answering sets of SMILES and general chemistry knowledge. Despite the continued pretraining phase not including direct exposure to SMILES representations, it significantly enhanced the Phi-3 model{'}s performance, a 300{\%} increase for the BLEU scores, in the molecule captioning task. The code and model are released at \url{https://github.com/bluesky333/Phi3KnowChem} to facilitate research in chemical small language modeling."
}
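
The abstract above describes captioning molecules from their SMILES strings with a domain-adapted Phi-3-mini-4k model. The snippet below is a minimal, hypothetical sketch of that inference step using the Hugging Face transformers chat interface; the model ID shown is the public base Phi-3-mini-4k-instruct checkpoint and the prompt wording is illustrative, not the authors' exact pipeline, whose adapted weights and scripts are released at https://github.com/bluesky333/Phi3KnowChem.

# Minimal sketch (assumptions: base checkpoint "microsoft/Phi-3-mini-4k-instruct",
# illustrative prompt wording). Swap in the Phi3KnowChem checkpoint from the
# linked repository to reproduce the domain-adapted setup described above.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

smiles = "CC(=O)Oc1ccccc1C(=O)O"  # aspirin, used here only as an example input
messages = [{
    "role": "user",
    "content": f"Describe the molecule with SMILES {smiles} in natural language.",
}]

# Build the chat prompt, generate greedily, and decode only the newly generated tokens.
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
output_ids = model.generate(input_ids, max_new_tokens=128, do_sample=False)
caption = tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True)
print(caption)

Greedy decoding is shown only for determinism; the decoding settings of the actual submission are not specified in this entry.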