@inproceedings{kumar-etal-2025-pretraining,
title = "Pretraining Language Models with {L}o{RA} and Artificial Languages",
author = "Kumar, Nalin and
Lango, Mateusz and
Du{\v{s}}ek, Ond{\v{r}}ej",
editor = "Charpentier, Lucas and
Choshen, Leshem and
Cotterell, Ryan and
Gul, Mustafa Omer and
Hu, Michael Y. and
Liu, Jing and
Jumelet, Jaap and
Linzen, Tal and
Mueller, Aaron and
Ross, Candace and
Shah, Raj Sanjay and
Warstadt, Alex and
Wilcox, Ethan Gotlieb and
Williams, Adina",
booktitle = "Proceedings of the First BabyLM Workshop",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.babylm-main.37/",
pages = "525--530",
ISBN = "TODO",
abstract = "Large language models (LLMs) require a substantial amount of training data, which contrasts with the data-efficient learning observed in humans. In our submission to the BabyLM Challenge, we address this disparity by proposing a parameter-efficient pretraining approach for language acquisition from limited data. Our approach involves initializing the model with token embeddings trained by a shallow model, followed by tuning the non-embedding parameters with non-linguistic data to introduce structural biases. Then, we freeze the resulting model and pretrain it on the 10M-token BabyLM corpus using LoRA adapters. Experiments on small corpora demonstrate that our approach improves upon classic pretraining of the entire model."
}
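The abstract above outlines a three-step recipe: initialize token embeddings from a shallow model, tune the non-embedding parameters on non-linguistic (artificial-language) data to instill structural biases, then freeze the model and pretrain it on the 10M-token BabyLM corpus through LoRA adapters. The sketch below illustrates only the final step using Hugging Face PEFT; the base checkpoint, adapter rank, and target modules are illustrative assumptions and not the configuration reported in the paper.

```python
# Minimal sketch of the LoRA-pretraining step described in the abstract:
# the base model's own weights stay frozen and only the LoRA adapter
# matrices receive gradients during pretraining on the BabyLM corpus.
# Model name and LoRA hyperparameters below are assumptions, not the
# paper's actual settings.
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model

# Stand-in for the structurally biased base model obtained in steps 1-2.
base = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Freeze every original parameter; only adapter weights will be trained.
for p in base.parameters():
    p.requires_grad = False

lora_cfg = LoraConfig(
    r=8,                        # adapter rank (assumed)
    lora_alpha=16,              # scaling factor (assumed)
    target_modules=["c_attn"],  # GPT-2 attention projection (assumed target)
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)
model = get_peft_model(base, lora_cfg)
model.print_trainable_parameters()  # confirms only LoRA matrices are trainable

# The frozen-base model with adapters would then be pretrained on the
# 10M-token BabyLM corpus with a standard causal-LM objective
# (e.g. via transformers.Trainer), which is omitted here.
```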