@inproceedings{kiyono-etal-2023-bridging,
title = "Bridging the Gap between Subword and Character Segmentation in Pretrained Language Models",
author = "Kiyono, Shun and
Takase, Sho and
Li, Shengzhe and
Sato, Toshinori",
editor = "Mitkov, Ruslan and
Angelova, Galia",
booktitle = "Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing",
month = sep,
year = "2023",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://preview.aclanthology.org/fix-sig-urls/2023.ranlp-1.62/",
pages = "568--577",
abstract = "Pretrained language models require the use of consistent segmentation (e.g., subword- or character-level segmentation) in pretraining and finetuning. In NLP, many tasks are modeled by subword-level segmentation better than by character-level segmentation. However, because of their format, several tasks require the use of character-level segmentation. Thus, in order to tackle both types of NLP tasks, language models must be independently pretrained for both subword and character-level segmentation. However, this is an inefficient and costly procedure. Instead, this paper proposes a method for training a language model with unified segmentation. This means that the trained model can be finetuned on both subword- and character-level segmentation. The principle of the method is to apply the subword regularization technique to generate a mixture of subword- and character-level segmentation. Through experiment on BERT models, we demonstrate that our method can halve the computational cost of pretraining."
}
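
The principle named in the abstract, sampling a mixture of subword- and character-level segmentations via subword regularization, can be illustrated with SentencePiece's sampling API. The following is a minimal sketch, assuming an existing SentencePiece model (the filename "spm.model" is a placeholder); it is not the authors' exact pretraining pipeline:

import sentencepiece as spm

# Load a pretrained SentencePiece model; "spm.model" is a placeholder path.
sp = spm.SentencePieceProcessor(model_file="spm.model")

sentence = "Pretrained language models require consistent segmentation."

# Deterministic (best) subword segmentation.
print(sp.encode(sentence, out_type=str))

# Subword regularization: sample segmentations from the candidate lattice.
# nbest_size=-1 samples from all candidates; a smaller alpha makes the
# sampling distribution closer to uniform, so finer-grained segmentations
# (down to near character level) are drawn more often.
for _ in range(3):
    print(sp.encode(sentence, out_type=str,
                    enable_sampling=True, alpha=0.1, nbest_size=-1))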