@inproceedings{aynetdinov-akbik-2025-babies,
title = "Babies Learn to Look Ahead: Multi-Token Prediction in Small {LM}s",
author = "Aynetdinov, Ansar and
Akbik, Alan",
editor = "Charpentier, Lucas and
Choshen, Leshem and
Cotterell, Ryan and
Gul, Mustafa Omer and
Hu, Michael Y. and
Liu, Jing and
Jumelet, Jaap and
Linzen, Tal and
Mueller, Aaron and
Ross, Candace and
Shah, Raj Sanjay and
Warstadt, Alex and
Wilcox, Ethan Gotlieb and
Williams, Adina",
booktitle = "Proceedings of the First BabyLM Workshop",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.babylm-main.41/",
pages = "566--577",
ISBN = "TODO",
abstract = "Multi-token prediction (MTP) is an alternative training objective for language models that has recently been proposed as a potential improvement over traditional next-token prediction (NTP). Instead of training models to predict only the next token, as is standard, MTP trains them to predict the next $k$ tokens at each step. While MTP was shown to improve downstream performance and sample efficiency in large language models (LLMs), smaller language models (SLMs) struggle with this objective. Recently, a curriculum-based approach was offered as a solution to this problem for models as small as 1.3B parameters by adjusting the difficulty of the training objective over time. In this work we investigate the viability of MTP curricula in a highly data- and parameter-constrained setting. Our experimental results show that even 130M-parameter models benefit from including the MTP task in the pre-training objective. These gains hold even under severe data constraints, as demonstrated on both zero-shot benchmarks and downstream tasks."
}