@inproceedings{tapaninaho-2025-moep,
title = "{M}o{EP}: Modular Expert Paths for Sample-Efficient Language Modeling",
author = "Tapaninaho, Joonas",
editor = "Charpentier, Lucas and
Choshen, Leshem and
Cotterell, Ryan and
Gul, Mustafa Omer and
Hu, Michael Y. and
Liu, Jing and
Jumelet, Jaap and
Linzen, Tal and
Mueller, Aaron and
Ross, Candace and
Shah, Raj Sanjay and
Warstadt, Alex and
Wilcox, Ethan Gotlieb and
Williams, Adina",
booktitle = "Proceedings of the First BabyLM Workshop",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.babylm-main.39/",
doi = "10.18653/v1/2025.babylm-main.39",
pages = "540--547",
ISBN = "TODO",
abstract = "Training language models under tight compute budgets with small training datasets remains challenging for dense decoder-only Transformers, where every token activates the full stack of model parameters. We introduce MoEP (Modular Expert Paths), a sparse decoder-only architecture that enables more selective token activation, which increases model performance and accelerates learning without increasing the total number of parameters. We show that combining model parallelism with Mixture-of-Experts (MoE) style linear projections and a lightweight top-k router outperforms the GPT-2 baseline and stabilizes evaluation performance more quickly."
}
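
The abstract above describes replacing dense linear projections with Mixture-of-Experts style expert projections gated by a lightweight top-k router. The sketch below is an illustrative PyTorch reconstruction of that general idea, not the authors' MoEP implementation; the module name `TopKMoELinear`, the expert count, the per-token softmax over the selected experts, and all hyperparameters are assumptions made for the example.

```python
# Minimal sketch of a top-k routed set of expert linear projections.
# Illustrative only: routing details and names are assumptions, not
# taken from the MoEP paper.
import torch
import torch.nn as nn
import torch.nn.functional as F


class TopKMoELinear(nn.Module):
    """Replaces one dense nn.Linear with num_experts parallel linear
    projections, of which only top_k are activated per token."""

    def __init__(self, d_in: int, d_out: int, num_experts: int = 8, top_k: int = 2):
        super().__init__()
        self.top_k = top_k
        self.experts = nn.ModuleList(nn.Linear(d_in, d_out) for _ in range(num_experts))
        self.router = nn.Linear(d_in, num_experts)  # lightweight token-level router

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, seq, d_in) -> flatten tokens for per-token routing
        b, s, d = x.shape
        tokens = x.reshape(b * s, d)

        # Score all experts per token, keep only the top_k, renormalize.
        scores = self.router(tokens)                        # (tokens, num_experts)
        top_vals, top_idx = scores.topk(self.top_k, dim=-1)
        gates = F.softmax(top_vals, dim=-1)                 # weights over chosen experts

        out = tokens.new_zeros(b * s, self.experts[0].out_features)
        for slot in range(self.top_k):
            for e, expert in enumerate(self.experts):
                mask = top_idx[:, slot] == e
                if mask.any():
                    out[mask] += gates[mask, slot].unsqueeze(-1) * expert(tokens[mask])
        return out.reshape(b, s, -1)


# Usage: route a batch of hidden states through the sparse projection.
x = torch.randn(2, 16, 256)
layer = TopKMoELinear(256, 256, num_experts=8, top_k=2)
print(layer(x).shape)  # torch.Size([2, 16, 256])
```

Because only `top_k` of the expert projections run per token, the per-token compute stays close to that of a single dense layer while the total parameter count is spread across experts, which is the selective-activation property the abstract attributes to MoEP.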