@inproceedings{velasco-roque-2025-scaling,
title = "Scaling, Simplification, and Adaptation: Lessons from Pretraining on Machine-Translated Text",
author = "Velasco, Dan John and
Roque, Matthew Theodore",
editor = "Adelani, David Ifeoluwa and
Arnett, Catherine and
Ataman, Duygu and
Chang, Tyler A. and
Gonen, Hila and
Raja, Rahul and
Schmidt, Fabian and
Stap, David and
Wang, Jiayi",
booktitle = "Proceedings of the 5th Workshop on Multilingual Representation Learning (MRL 2025)",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.mrl-main.40/",
pages = "612--630",
ISBN = "979-8-89176-345-6",
abstract = "Most languages lack sufficient data for large-scale monolingual pretraining, creating a ``data wall.'' Multilingual pretraining helps but is limited by language imbalance and the ``curse of multilinguality.'' An alternative is to translate high-resource text with machine translation (MT), which raises three questions: (1) How does MT-derived data scale with model capacity? (2) Can source-side transformations (e.g., simplifying English with an LLM) improve generalization to native text? (3) How well do models pretrained on MT-derived data adapt when continually trained on limited native text? We investigate these questions by translating English into Indonesian and Tamil{---}two typologically distant, lower-resource languages{---}and pretraining GPT-2 models (124M{--}774M) on native or MT-derived corpora from raw and LLM-simplified English. We evaluate cross-entropy loss on native text, along with accuracy on syntactic probes and downstream tasks. Our results show that (1) MT-pretrained models benefit from scaling; (2) source-side simplification harms generalization to native text; and (3) adapting MT-pretrained models on native text often yields better performance than native-only models, even with less native data. However, tasks requiring cultural nuance (e.g., toxicity detection) demand more exposure to native data."
}