% LREC 2008 conference paper by Oana Frunza on a trainable tokenizer.
% The url field uses the canonical ACL Anthology address for record L08-1590;
% preview-branch hosts are temporary builds and do not make stable links.
@inproceedings{frunza-2008-trainable,
  title     = {A Trainable Tokenizer, solution for multilingual texts and compound expression tokenization},
  author    = {Frunza, Oana},
  editor    = {Calzolari, Nicoletta and
               Choukri, Khalid and
               Maegaard, Bente and
               Mariani, Joseph and
               Odijk, Jan and
               Piperidis, Stelios and
               Tapias, Daniel},
  booktitle = {Proceedings of the Sixth International Conference on Language Resources and Evaluation ({LREC}'08)},
  month     = may,
  year      = {2008},
  address   = {Marrakech, Morocco},
  publisher = {European Language Resources Association (ELRA)},
  url       = {https://aclanthology.org/L08-1590/},
  abstract  = {Tokenization is one of the initial steps done for almost any text processing task. It is not particularly recognized as a challenging task for English monolingual systems but it rapidly increases in complexity for systems that apply it for different languages. This article proposes a supervised learning approach to perform the tokenization task. The method presented in this article is based on character transitions representation, a representation that allows compound expressions to be recognized as a single token. Compound tokens are identified independent of the character that creates the expression. The method automatically learns tokenization rules from a pre-tokenized corpus. The results obtained using the trainable system show that for Romanian and English a statistical significant improvement is obtained over a baseline system that tokenizes texts on every non-alphanumeric character.},
}
Markdown (Informal)
[A Trainable Tokenizer, solution for multilingual texts and compound expression tokenization](https://aclanthology.org/L08-1590/) (Frunza, LREC 2008)
ACL