@inproceedings{zain-etal-2025-single,
title = "Single layer tiny Co4 outpaces {GPT}-2 and {GPT}-{BERT}",
author = "Zain, Noor Ul and
Naseem, Mohsin Raza and
Adeel, Ahsan",
editor = "Charpentier, Lucas and
Choshen, Leshem and
Cotterell, Ryan and
Gul, Mustafa Omer and
Hu, Michael Y. and
Liu, Jing and
Jumelet, Jaap and
Linzen, Tal and
Mueller, Aaron and
Ross, Candace and
Shah, Raj Sanjay and
Warstadt, Alex and
Wilcox, Ethan Gotlieb and
Williams, Adina",
booktitle = "Proceedings of the First BabyLM Workshop",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.babylm-main.24/",
pages = "313--322",
ISBN = "TODO",
abstract = "We show that a tiny $Co^{4}$ machine (CITATION) with a single layer, two heads, and 8M parameters, operating at $O(N)$ computational cost (where \textit{N} is the number of input tokens), in just 2 epochs outpaces GPT-2 (124M, 12 layers, $O(N^2)$) and GPT-BERT (30M, 12 layers, $O(N^2)$), both trained for 10 epochs. $Co^{4}$ achieves orders-of-magnitude greater training efficiency on 10M tokens, demonstrating sample-efficient pretraining. On the BabyLM challenge evaluation pipeline, $Co^{4}$ performs comparably or better across complex benchmarks, showing strong zero-shot and fine-tuning performance on SuperGLUE tasks. Specifically, $Co^{4}$ outperforms GPT-2 in 5 out of 7 zero-shot metrics and 6 out of 7 fine-tuning tasks, and GPT-BERT in 4 out of 7 metrics in both cases. These results strongly suggest a need to rethink prevailing deep learning paradigms and associated scaling laws."
}