@inproceedings{jiang-etal-2026-tokenizer,
    title = "Tokenizer-Aware Cross-Lingual Adaptation of Decoder-Only {LLM}s through Embedding Relearning and Swapping",
    author = "Jiang, Fan and
      Yu, Honglin and
      Chung, Grace Y and
      Cohn, Trevor",
    editor = "Demberg, Vera and
      Inui, Kentaro and
      Marquez, Llu{\'i}s",
    booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
    month = mar,
    year = "2026",
    address = "Rabat, Morocco",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/ingest-eacl/2026.eacl-long.357/",
    pages = "7606--7636",
    isbn = "979-8-89176-380-7",
    abstract = "Extending Large Language Models (LLMs) to new languages is challenging, with most methods proposed suffering from high computational cost and catastrophic forgetting of original model capabilities. Embedding relearning{~}(CITATION), a technique that creates new tokenizers and tunes embeddings on fixed model weights for target language adaptation, is both light-weight and performant. However, it has only been shown to work for older generation encoder-only models and for high resource languages. In this paper, we extend this framework to decoder-only LLMs focusing on joint adaptation to many languages, including low-resource ones. We experiment in three language groups over 100 languages each. We adapt a pre-trained LLM via switching to a customized tokenizer, and relearning the embedding layer. Across 96 diverse languages spanning both classification and generation tasks, we show embedding relearning improves models by up to 20{\%}, being highly competitive with full-weight updating baselines while vastly more computationally efficient and mitigating catastrophic forgetting. This translates into better results in transferring the improved multilingual performance to tasks that build on core English abilities (e.g., multilingual math reasoning), compared to various baselines. Further analysis reveals the critical role of customizing tokenizers in achieving effective language transfer, particularly for non-Latin script languages."
}
Markdown (Informal)
[Tokenizer-Aware Cross-Lingual Adaptation of Decoder-Only LLMs through Embedding Relearning and Swapping](https://preview.aclanthology.org/ingest-eacl/2026.eacl-long.357/) (Jiang et al., EACL 2026)
ACL