@inproceedings{hofmann-etal-2022-embarrassingly,
title = "An Embarrassingly Simple Method to Mitigate Undesirable Properties of Pretrained Language Model Tokenizers",
author = "Hofmann, Valentin and
Sch{\"u}tze, Hinrich and
Pierrehumbert, Janet",
editor = "Muresan, Smaranda and
Nakov, Preslav and
Villavicencio, Aline",
booktitle = "Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
month = may,
year = "2022",
address = "Dublin, Ireland",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2022.acl-short.43/",
doi = "10.18653/v1/2022.acl-short.43",
pages = "385--393",
abstract = "We introduce FLOTA (Few Longest Token Approximation), a simple yet effective method to improve the tokenization of pretrained language models (PLMs). FLOTA uses the vocabulary of a standard tokenizer but tries to preserve the morphological structure of words during tokenization. We evaluate FLOTA on morphological gold segmentations as well as a text classification task, using BERT, GPT-2, and XLNet as example PLMs. FLOTA leads to performance gains, makes inference more efficient, and enhances the robustness of PLMs with respect to whitespace noise."
}
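
The abstract describes FLOTA only at a high level (covering a word with a "few longest tokens" from the existing vocabulary so that morphological structure is preserved). The snippet below is a minimal, hypothetical sketch of that greedy idea, not the authors' reference implementation: the names `flota_like_tokenize`, `vocab`, and the budget `k` are illustrative assumptions, and it ignores details such as the "##" prefix that BERT-style vocabularies use for word-internal pieces (it also assumes the input word contains no "-", which is used here as a mask character).

```python
def flota_like_tokenize(word, vocab, k=3):
    """Return up to `k` vocabulary pieces covering `word`, chosen longest-first,
    and emitted in their original left-to-right order."""
    pieces = _collect(word, vocab, k)
    return [piece for _, piece in sorted(pieces)]

def _collect(word, vocab, k):
    if k == 0:
        return []
    # Find the longest not-yet-masked substring of `word` present in the vocabulary.
    for length in range(len(word), 0, -1):
        for start in range(len(word) - length + 1):
            piece = word[start:start + length]
            if "-" not in piece and piece in vocab:
                # Mask the matched span and recurse on the rest with a smaller budget.
                masked = word[:start] + "-" * length + word[start + length:]
                return [(start, piece)] + _collect(masked, vocab, k - 1)
    return []

# Toy vocabulary; a real tokenizer vocabulary would come from the PLM.
vocab = {"under", "estimate", "s", "un", "der", "est"}
print(flota_like_tokenize("underestimates", vocab))  # ['under', 'estimate', 's']
```

The point of the greedy longest-first choice is that whole morphemes such as "under" and "estimate" are kept intact instead of being split into shorter, less meaningful pieces.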