% Conference-paper record for "Ted-Tok" (ACL 2026, Volume 1: Long Papers).
% The coined name is brace-protected in the title so that sentence-casing
% bibliography styles do not lowercase it to "Ted-tok".
% NOTE(review): url points at the preview.aclanthology.org ingest host -- confirm
% whether the canonical aclanthology.org address should replace it once published.
@inproceedings{huang-etal-2026-ted,
  title     = {{Ted-Tok}: Maintaining an Evolving Vocabulary for Lifelong Learning},
  author    = {Huang, Jiameng and
               Zhang, Zhi and
               He, Zhenyu and
               Sun, Jiacheng and
               He, Di},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.394/},
  pages     = {8706--8719},
  isbn      = {979-8-89176-390-6},
  abstract  = {Lifelong learning investigates how models adapt when exposed to a potentially infinite stream of data. Most conventional approaches focus on updating model parameters (i.e., the neural network weights) as the underlying data distribution evolves over time. However, in natural language processing, model parameters are not the only components that matter. The tokenizer, a foundational part of the system, is usually assumed to remain fixed in lifelong learning scenarios. In this work, we challenge the validity of this assumption: as language evolves, a static tokenizer fragments newly emerging lexical items, reducing compression efficiency and consequently degrading the model performance. We introduce the Temporal Drift Tokenizer (Ted-Tok), which maintains an evolving vocabulary that adapts to emerging linguistic patterns over time. This adaptivity is driven by time-weighted frequency estimators that smooth short-term fluctuations to capture persistent linguistic trends, and a principled addition-deletion strategy targeting sink tokens. Across multiple domains, Ted-Tok consistently improves compression and task performance, with gains increasing under stronger drift, underscoring the role of tokenizer adaptivity in lifelong learning.},
}

Markdown (Informal)
[Ted-Tok: Maintaining an Evolving Vocabulary for Lifelong Learning](https://preview.aclanthology.org/ingest-acl/2026.acl-long.394/) (Huang et al., ACL 2026)
ACL
- Jiameng Huang, Zhi Zhang, Zhenyu He, Jiacheng Sun, and Di He. 2026. Ted-Tok: Maintaining an Evolving Vocabulary for Lifelong Learning. In Proceedings of the 64th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 8706–8719, San Diego, California, United States. Association for Computational Linguistics.