% Hiraoka, ACL 2026 (Volume 1: Long Papers): corpus-dependent subcharacter encoding (Latom).
% NOTE(review): url points at a preview/ingest host -- confirm the canonical URL before publishing.
@inproceedings{hiraoka-2026-corpus,
  title     = {Corpus-Dependent Subcharacter Encoding via {HMM}-Guided Code Assignment},
  author    = {Hiraoka, Tatsuya},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.1596/},
  pages     = {34569--34593},
  isbn      = {979-8-89176-390-6},
  abstract  = {We propose a corpus-dependent alternative to byte encoding that learns fixed-length atomic codes for characters directly from text, which we refer to as Latom (Learned Atom-based Encoding). We instantiate this framework by training an HMM on N-repeated character sequences to estimate ``atom'' posteriors, followed by a Hungarian assignment yielding a globally optimal one-to-one character-code mapping. Across 14 languages, the encodings improve intrinsic metrics, including token counts after subword tokenization and bigram perplexity, with appropriate code lengths. On Amazon Reviews in six languages, Latom improves text classification accuracy and reduces decoding errors in language model generation. Overall, these results demonstrate that character encodings can be learned from corpus statistics while remaining reversible and compatible with standard tokenization pipelines.},
}
Markdown (Informal)
[Corpus-Dependent Subcharacter Encoding via HMM-Guided Code Assignment](https://preview.aclanthology.org/ingest-acl/2026.acl-long.1596/) (Hiraoka, ACL 2026)
ACL