% Journal article: Computational Linguistics, volume 52, number 1 (March 2026), pages 295--330.
@article{yamaguchi-etal-2026-effectively,
  title     = {How Can We Effectively Expand the Vocabulary of {LLMs} with {0.01GB} of Target Language Text?},
  author    = {Yamaguchi, Atsuki and
               Villavicencio, Aline and
               Aletras, Nikolaos},
  journal   = {Computational Linguistics},
  volume    = {52},
  number    = {1},
  month     = mar,
  year      = {2026},
  address   = {Cambridge, MA},
  publisher = {MIT Press},
  url       = {https://preview.aclanthology.org/ingest-latest-mitpress-cl-tacl/2026.cl-1.9/},
  doi       = {10.1162/coli.a.581},
  pages     = {295--330},
  abstract  = {Large language models (LLMs) have shown remarkable capabilities in many languages beyond English. Yet, LLMs require more inference steps when generating non-English text due to their reliance on English-centric tokenizers and vocabulary, resulting in higher usage costs to non-English speakers. Vocabulary expansion with target language tokens is a widely used cross-lingual vocabulary adaptation approach to remedy this issue. Despite its effectiveness in inference speedup, previous work on vocabulary expansion has focused on high-resource settings assuming access to a substantial amount of target language data to effectively initialize the embeddings of the new tokens and adapt the LLM to the target language. However, vocabulary expansion in low-resource settings has yet to be explored. In this article, we investigate vocabulary expansion in low-resource settings by considering embedding initialization methods and continual pre-training strategies. Through extensive experiments across typologically diverse languages, tasks, and models, we establish a set of strategies to perform vocabulary expansion for faster inference, while striving to maintain competitive downstream performance to baselines. This is achieved with only 30K sentences ({\ensuremath{\sim}}0.01GB text data) from the target language.},
}
Markdown (Informal)
[How Can We Effectively Expand the Vocabulary of LLMs with 0.01GB of Target Language Text?](https://preview.aclanthology.org/ingest-latest-mitpress-cl-tacl/2026.cl-1.9/) (Yamaguchi et al., CL 2026)
ACL