@inproceedings{pfister-etal-2025-llammlein,
title = {{LL}{\"a}{M}mlein: Transparent, Compact and Competitive {G}erman-Only Language Models from Scratch},
author = "Pfister, Jan and
Wunderle, Julia and
Hotho, Andreas",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.111/",
pages = "2227--2246",
ISBN = "979-8-89176-251-0",
  abstract = {We transparently create two German-only decoder models, LL{\"a}Mmlein 120M and 1B, from scratch and publish them, along with the training data, for the (German) NLP research community to use. The model training involved several key steps, including data preprocessing/filtering, the creation of a German tokenizer, the training itself, as well as the evaluation of the final models on various benchmarks, also against existing models. Throughout the training process, multiple checkpoints were saved in equal intervals and analyzed using the German SuperGLEBer benchmark to gain insights into the models' learning process. Compared to state-of-the-art models on the SuperGLEBer benchmark, both LL{\"a}Mmlein models performed competitively, consistently matching or surpassing models with similar parameter sizes. The results show that the models' quality scales with size as expected, but performance improvements on some tasks plateaued early during training, offering valuable insights into resource allocation for future models.}
}