% Workshop paper record (AmericasNLP 2026 proceedings).
% Acronyms and proper nouns in title/booktitle are brace-protected as whole
% words so that sentence-casing styles keep their capitalization.
% NOTE(review): the url below points at a preview/ingest host -- presumably it
%   should be swapped for the canonical Anthology URL (or a doi) once the
%   volume is published; TODO confirm.
% NOTE(review): the editor names "Solano, Rolando Coto" and
%   "Von Der Wense, Katharina" look like ingest-normalized forms; verify the
%   surname split and capitalization against the published front matter.
@inproceedings{thompson-etal-2026-bringing,
  title     = {Bringing {Mapudungun} into the {Modern} {MT} Ecosystem: Morphology-Aware Tokenization for {NLLB}-200 Fine-Tuning},
  author    = {Thompson, Isaac and
               Rogers, Brandon and
               Ringger, Eric},
  editor    = {Mager, Manuel and
               Ebrahimi, Abteen and
               Bui, Minh Duc and
               Pugh, Robert and
               Oncevay, Arturo and
               Chiruzzo, Luis and
               Solano, Rolando Coto and
               Rijhwani, Shruti and
               Von Der Wense, Katharina},
  booktitle = {Proceedings of the Sixth Workshop on {NLP} for Indigenous Languages of the {Americas} ({AmericasNLP})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.americasnlp-6.16/},
  pages     = {173--185},
  isbn      = {979-8-89176-415-6},
  abstract  = {For Mapudungun arn{\textrightarrow}es translation, morphology-aware tokenization can substitute for a 5{\texttimes} increase in model parameters. We fine-tune three sizes of Meta{'}s NLLB-200 on Mapudungun{--}Spanish translation across eight tokenization strategies, including our novel Morfessor-VC method, which constrains Morfessor morpheme segmentation to tokens already present in NLLB{'}s pretrained vocabulary. Our 600M Morfessor-VC model is competitive with our own fine-tuned 3.3B Standard BPE model on arn{\textrightarrow}es (43.2 vs. 42.9 chrF++, {\ensuremath{\Delta}} = +0.3, p = 0.039, 95{\%} CI [0.02, 0.60]) while using five times fewer parameters, and all fine-tuned conditions surpass frontier LLMs by over 27 chrF++. Mapudungun is an indigenous polysynthetic language spoken by 200,000+ Mapuche people in Chile and Argentina, absent from NLLB-200 and not supported by major commercial MT providers; prior work predates large-scale multilingual models and does not address the tokenization challenges posed by its agglutinative morphology. These results establish new state-of-the-art baselines for Mapudungun MT and provide a practical foundation for community language tools in pedagogy, social media, and language revitalization.},
}
Markdown (Informal)
[Bringing Mapudungun into the Modern MT Ecosystem: Morphology-Aware Tokenization for NLLB-200 Fine-Tuning](https://preview.aclanthology.org/ingest-acl-workshops/2026.americasnlp-6.16/) (Thompson et al., AmericasNLP 2026)
ACL