@inproceedings{boesenberg-evang-2026-trainable,
title = "Trainable, Multiword-aware Linguistic Tokenization Using Modern Neural Networks",
author = "Boesenberg, Clara and
Evang, Kilian",
editor = "Baez Santamaria, Selene and
Somayajula, Sai Ashish and
Yamaguchi, Atsuki",
booktitle = "Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 4: Student Research Workshop)",
month = mar,
year = "2026",
address = "Rabat, Morocco",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-eacl/2026.eacl-srw.19/",
pages = "266--276",
ISBN = "979-8-89176-383-8",
abstract = "We revisit MWE-aware linguistic tokenization as a character-level and token-level sequence labeling problem and present a systematic evaluation on English, German, Italian, and Dutch data. We compare a standard tokenizer trained without MWE-awareness as a baseline (UDPipe), a character-level SRN+CRF model (Elephant), and transformer-based MaChAmp models trained either directly on gold character labels or as token-level postprocessors on top of UDPipe. Our results show that the two-stage pipeline {--} UDPipe pretokenization followed by MaChAmp postprocessing {--} consistently yields the best accuracy. Our analysis of error patterns highlights how different architectures trade off over- and undersegmentation. These findings provide practical guidance for building MWE-aware tokenizers and suggest that postprocessing pipelines with transformers are a strong and general strategy for non-standard tokenization."
}