% Workshop paper in the CMLC-10 proceedings (2022). The url field holds the
% canonical ACL Anthology permalink rather than a preview-branch address.
@inproceedings{diewald-2022-matrix,
  title     = {Matrix and Double-Array Representations for Efficient Finite State Tokenization},
  author    = {Diewald, Nils},
  editor    = {Banski, Piotr and
               Barbaresi, Adrien and
               Clematide, Simon and
               Kupietz, Marc and
               L{\"u}ngen, Harald},
  booktitle = {Proceedings of the Workshop on Challenges in the Management of Large Corpora ({CMLC-10})},
  month     = jun,
  year      = {2022},
  address   = {Marseille, France},
  publisher = {European Language Resources Association},
  url       = {https://aclanthology.org/2022.cmlc-1.4/},
  pages     = {20--26},
  abstract  = {This paper presents an algorithm and implementation for efficient tokenization of space-delimited languages based on a deterministic finite state automaton. Two representations of the underlying data structure are presented and a model implementation for German is compared with state-of-the-art approaches. The presented solution is faster than other tools while maintaining comparable quality.},
}
Markdown (Informal)
[Matrix and Double-Array Representations for Efficient Finite State Tokenization](https://aclanthology.org/2022.cmlc-1.4/) (Diewald, CMLC 2022)
ACL