@inproceedings{gren-kurfali-2026-efficient,
title = "Efficient Low-Resource Language Models Using Tokenizer Transfer",
author = "Gren, Gustaf and
Kurfali, Murathan",
editor = "Baez Santamaria, Selene and
Somayajula, Sai Ashish and
Yamaguchi, Atsuki",
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 4: Student Research Workshop)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-eacl/2026.eacl-srw.49/",
pages = "639--648",
ISBN = "979-8-89176-383-8",
abstract = "Training a language model for low-resource languages is challenging due to data scarcity and computational cost. Tokenizer transfer offers a way to adapt a pre-trained model to a new tokenizer without full retraining, improving efficiency and cross-lingual applicability. To the best of our knowledge, we present the first controlled evaluation of two tokenizer transfer methods, Orthogonal Mapping Pursuit (OMP) and Fast Vocabulary Transfer (FVT), on monolingually pretrained base models trained on language-specific corpora, across six languages and multiple finetuning regimes. Using the Goldfish model family, we measure byte-normalized log-perplexity and MultiBLiMP accuracy to assess target-language adaptability, source-language retention, and the interaction between transfer and monolingual or mixed finetuning. OMP with monolingual target finetuning yields the best target-language scores (lower log-perplexity and higher MultiBLiMP accuracy) among the evaluated conditions, outperforming (i) a model trained only on the source language, (ii) a model trained on a smaller amount of target-language data, and (iii) the source-language model adapted via standard finetuning on the target data. The results suggest that tokenizer transfer is a compute-efficient alternative for low-resource LM training: train a monolingual tokenizer for the target language, transfer it to a larger pre-trained model, and fine-tune on the target data."
}