@inproceedings{li-etal-2025-embyte,
title = "{E}m{B}yte: Decomposition and Compression Learning for Small yet Private {NLP}",
author = "Li, Shenglan and
Xu, Jia and
Zhang, Mengjiao",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.findings-emnlp.379/",
doi = "10.18653/v1/2025.findings-emnlp.379",
pages = "7182--7201",
ISBN = "979-8-89176-335-7",
abstract = "Recent breakthroughs in natural language processing (NLP) have come with escalating model sizes and computational costs, posing significant challenges for deployment in real-time and resource-constrained environments. We introduce EMBYTE, a novel byte-level tokenization model that achieves substantial embedding compression while preserving NLP accuracy and enhancing privacy. At the core of EMBYTE is a new Decompose-and-Compress (DeComp) learning strategy that decomposes subwords into fine-grained byte embeddings and then compresses them via neural projection. DeComp enables EMBYTE to be shrunk down to any vocabulary size (e.g., 128 or 256), drastically reducing embedding parameter count by up to 94{\%} compared to subword-based models without increasing sequence length or degrading performance. Moreover, EMBYTE is resilient to privacy threats such as gradient inversion attacks, due to its byte-level many-to-one mapping structure. Empirical results on GLUE, machine translation, sentiment analysis, and language modeling tasks show that EMBYTE matches or surpasses the performance of significantly larger models, while offering improved efficiency. This makes EMBYTE a lightweight and generalizable NLP solution, well-suited for deployment in privacy-sensitive or low-resource environments."
}