% EACL 2026 (Volume 1: Long Papers) conference paper. The url is the canonical
% ACL Anthology address, not the temporary preview/ingest one.
@inproceedings{burns-etal-2026-aleph,
  author    = {Burns, Thomas F and Parcalabescu, Letitia and Waeldchen, Stephan and Barlow, Michael and Ziegltrum, Gregor and Stampa, Volker and Harren, Bastian and Deiseroth, Bj{\"o}rn},
  title     = {{Aleph-Alpha-GermanWeb}: Improving {German}-language {LLM} pre-training with model-based data curation and synthetic data generation},
  editor    = {Demberg, Vera and Inui, Kentaro and Marquez, Llu{\'i}s},
  booktitle = {Proceedings of the 19th Conference of the {European} Chapter of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  pages     = {1267--1283},
  isbn      = {979-8-89176-380-7},
  url       = {https://aclanthology.org/2026.eacl-long.58/},
}