@comment{ACL Anthology record 2020.lrec-1.429. The url field uses the canonical aclanthology.org address, not a preview-branch build.}
@inproceedings{moon-okazaki-2020-jamo,
  title     = {Jamo Pair Encoding: Subcharacter Representation-based Extreme {Korean} Vocabulary Compression for Efficient Subword Tokenization},
  author    = {Moon, Sangwhan and
               Okazaki, Naoaki},
  editor    = {Calzolari, Nicoletta and
               B{\'e}chet, Fr{\'e}d{\'e}ric and
               Blache, Philippe and
               Choukri, Khalid and
               Cieri, Christopher and
               Declerck, Thierry and
               Goggi, Sara and
               Isahara, Hitoshi and
               Maegaard, Bente and
               Mariani, Joseph and
               Mazo, H{\'e}l{\`e}ne and
               Moreno, Asuncion and
               Odijk, Jan and
               Piperidis, Stelios},
  booktitle = {Proceedings of the Twelfth Language Resources and Evaluation Conference},
  month     = may,
  year      = {2020},
  address   = {Marseille, France},
  publisher = {European Language Resources Association},
  url       = {https://aclanthology.org/2020.lrec-1.429/},
  pages     = {3490--3497},
  language  = {eng},
  isbn      = {979-10-95546-34-4},
  abstract  = {In the context of multilingual language model pre-training, vocabulary size for languages with a broad set of potential characters is an unsolved problem. We propose two algorithms applicable in any unsupervised multilingual pre-training task, increasing the elasticity of budget required for building the vocabulary in Byte-Pair Encoding inspired tokenizers, significantly reducing the cost of supporting Korean in a multilingual model.},
}
@comment{Markdown (Informal):
[Jamo Pair Encoding: Subcharacter Representation-based Extreme Korean Vocabulary Compression for Efficient Subword Tokenization](https://aclanthology.org/2020.lrec-1.429/) (Moon & Okazaki, LREC 2020)
}
ACL