% Iwamoto and Kanayama, 17th International Natural Language Generation
% Conference (2024), pages 24--29.
% The url field uses the canonical ACL Anthology address for paper ID
% 2024.inlg-main.3 rather than a temporary preview-branch link.
% NOTE(review): address holds "Tokyo, Japan", which looks like the conference
% venue rather than the publisher's city -- confirm against the target style.
@inproceedings{iwamoto-kanayama-2024-llm-neologism,
  title     = {{LLM} Neologism: Emergence of Mutated Characters due to Byte Encoding},
  author    = {Iwamoto, Ran and Kanayama, Hiroshi},
  editor    = {Mahamood, Saad and Minh, Nguyen Le and Ippolito, Daphne},
  booktitle = {Proceedings of the 17th International Natural Language Generation Conference},
  month     = sep,
  year      = {2024},
  address   = {Tokyo, Japan},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.inlg-main.3/},
  pages     = {24--29},
  abstract  = {The process of language generation, which selects the most probable tokens one by one, may intrinsically result in output strings that humans never utter. We name this phenomenon {\textquotedblleft}LLM neologism{\textquotedblright} and investigate it focusing on Japanese, Chinese, and Korean languages, where tokens can be smaller than characters. Our findings show that LLM neologism occurs through the combination of two high-frequency words with common tokens. We also clarify the cause of LLM neologism in the tokenization process with limited vocabularies. The results of this study provides important clues for better encoding of multibyte characters, aiming to prevent catastrophic results in AI-generated documents.},
}
Markdown (Informal)
[LLM Neologism: Emergence of Mutated Characters due to Byte Encoding](https://aclanthology.org/2024.inlg-main.3/) (Iwamoto & Kanayama, INLG 2024)
ACL