@inproceedings{yuan-etal-2025-legomt2,
title = "{L}ego{MT}2: Selective Asynchronous Sharded Data Parallel Training for Massive Neural Machine Translation",
author = "Yuan, Fei and
Lu, Yinquan and
Li, Lei and
Xu, Jingjing",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/display_plenaries/2025.findings-acl.1200/",
pages = "23359--23376",
ISBN = "979-8-89176-256-5",
abstract = "It is a critical challenge to learn a single model for massive languages. Prior methods focus on increasing the model size and training data size. However, large models are difficult to optimize efficiently even with distributed parallel training and translation capacity can interfere among languages. To address the challenge, we propose LegoMT2, an efficient training approach with an asymmetric multi-way model architecture for massive multilingual neural machine translation. LegoMT2 shards 435 languages into 8 language-centric groups and attributes one local encoder for each group{'}s languages and a mix encoder-decoder for all languages. LegoMT2 trains the model through local data parallel and asynchronous distributed updating of parameters. LegoMT2 is 16.2$\times$ faster than the distributed training method for M2M-100-12B (which only for 100 languages) while improving the translation performance by an average of 2.2 BLEU on \textit{Flores-101}, especially performing better for low-resource languages ."
}
Markdown (Informal)
[LegoMT2: Selective Asynchronous Sharded Data Parallel Training for Massive Neural Machine Translation](https://preview.aclanthology.org/display_plenaries/2025.findings-acl.1200/) (Yuan et al., Findings 2025)