@inproceedings{loaiciga-etal-2025-exploring,
title = "Exploring smaller batch sizes for a high-performing {B}aby{LM} model architecture",
author = "Lo{\'a}iciga, Sharid and
Fysikoudi, Eleni and
Sayeed, Asad B.",
editor = "Charpentier, Lucas and
Choshen, Leshem and
Cotterell, Ryan and
Gul, Mustafa Omer and
Hu, Michael Y. and
Liu, Jing and
Jumelet, Jaap and
Linzen, Tal and
Mueller, Aaron and
Ross, Candace and
Shah, Raj Sanjay and
Warstadt, Alex and
Wilcox, Ethan Gotlieb and
Williams, Adina",
booktitle = "Proceedings of the First BabyLM Workshop",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.babylm-main.12/",
pages = "155--159",
ISBN = "TODO",
abstract = "We explore the conditions under which the highest-performing entry to the BabyLM task in 2023, Every Layer Counts BERT or ELC-BERT, is best-performing given more constrained resources than the original run, with a particular focus on batch size. ELC-BERT{'}s relative success, as an instance of model engineering compared to more cognitively-motivated architectures, could be taken as evidence that the ``lowest-hanging'' fruit is to be found from non-linguistic machine learning approaches. We find that if we take away the advantage of training time from ELC-BERT, the advantage of the architecture mostly disappears, but some hyperparameter combinations nevertheless differentiate themselves in performance."
}