@article{clark-etal-2022-canine,
title = "Canine: Pre-training an Efficient Tokenization-Free Encoder for Language Representation",
author = "Clark, Jonathan H. and
Garrette, Dan and
Turc, Iulia and
Wieting, John",
editor = "Roark, Brian and
Nenkova, Ani",
journal = "Transactions of the Association for Computational Linguistics",
volume = "10",
year = "2022",
address = "Cambridge, MA",
publisher = "MIT Press",
    url = "https://aclanthology.org/2022.tacl-1.5/",
doi = "10.1162/tacl_a_00448",
pages = "73--91",
abstract = "Pipelined NLP systems have largely been superseded by end-to-end neural modeling, yet nearly all commonly used models still require an explicit tokenization step. While recent tokenization approaches based on data-derived subword lexicons are less brittle than manually engineered tokenizers, these techniques are not equally suited to all languages, and the use of any fixed vocabulary may limit a model{'}s ability to adapt. In this paper, we present Canine, a neural encoder that operates directly on character sequences{---}without explicit tokenization or vocabulary{---}and a pre-training strategy that operates either directly on characters or optionally uses subwords as a soft inductive bias. To use its finer-grained input effectively and efficiently, Canine combines downsampling, which reduces the input sequence length, with a deep transformer stack, which encodes context. Canine outperforms a comparable mBert model by 5.7 F1 on TyDi QA, a challenging multilingual benchmark, despite having fewer model parameters."
}
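
The abstract above describes the core architectural idea: embed raw characters without a tokenizer, downsample the long character sequence, then encode context with a deep transformer stack. The following is a minimal PyTorch sketch of that idea, not the authors' implementation; the hash-bucket count, downsampling rate of 4, and layer/head sizes are illustrative assumptions rather than the paper's exact configuration.

```python
# Hedged sketch: character embeddings -> strided-conv downsampling -> deep transformer.
# Hyperparameters here are assumptions for illustration, not CANINE's published setup.
import torch
import torch.nn as nn


class CharDownsamplingEncoder(nn.Module):
    def __init__(self, num_hash_buckets=16384, d_model=768,
                 downsample_rate=4, num_layers=12, nhead=12):
        super().__init__()
        # Vocabulary-free input: codepoints are mapped into embedding buckets
        # (CANINE uses hashed codepoint embeddings; a plain modulo stands in here).
        self.num_hash_buckets = num_hash_buckets
        self.char_embed = nn.Embedding(num_hash_buckets, d_model)
        # Strided convolution shortens the sequence by `downsample_rate`,
        # so the deep stack runs over far fewer positions.
        self.downsample = nn.Conv1d(d_model, d_model,
                                    kernel_size=downsample_rate,
                                    stride=downsample_rate)
        layer = nn.TransformerEncoderLayer(d_model, nhead,
                                           dim_feedforward=4 * d_model,
                                           batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers)

    def forward(self, text: str) -> torch.Tensor:
        # Unicode codepoints -> bucket ids: no tokenizer, no fixed vocabulary.
        ids = torch.tensor([[ord(c) % self.num_hash_buckets for c in text]])
        x = self.char_embed(ids)                                 # (1, len, d_model)
        x = self.downsample(x.transpose(1, 2)).transpose(1, 2)   # (1, len//4, d_model)
        return self.encoder(x)                                   # contextual encodings


enc = CharDownsamplingEncoder()
out = enc("Tokenization-free encoding works on raw characters.")
print(out.shape)  # roughly (1, input_length // 4, 768)
```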