@comment{
  LREC 2026 main-conference paper, taken from an ACL Anthology preview export.
  Typed as inproceedings because the venue is a conference and the entry
  carries a proceedings editor list, which article-type styles do not print.
  The exporter's volume value (main) is the Anthology volume identifier, not a
  bibliographic volume number, so it is kept in the non-standard, style-ignored
  field anthology-volume instead of volume.
  NOTE(review): booktitle reuses the venue string from the export; replace it
  with the official proceedings title once that is confirmed.
  NOTE(review): url points at a preview/ingest host; confirm the canonical URL
  after publication.
  NOTE(review): publisher reads Resource (singular) as exported; verify against
  the publisher's official name.
}
@inproceedings{pranjic-etal-2026-slovene,
  title             = {{Slovene} Morphological and Word Formation Segmentation: A Novel Dataset and Evaluation},
  author            = {Pranji{\'c}, Marko and
                       Kern, Boris and
                       Vor{\v{s}}i{\v{c}}, Ines and
                       Pollak, Senja},
  editor            = {Piperidis, Stelios and
                       Bel, N{\'u}ria and
                       van den Heuvel, Henk and
                       Ide, Nancy and
                       Krek, Simon and
                       Toral, Antonio},
  booktitle         = {International Conference on Language Resources and Evaluation},
  anthology-volume  = {main},
  month             = may,
  year              = {2026},
  address           = {Palma de Mallorca, Spain},
  publisher         = {ELRA Language Resource Association},
  url               = {https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.140/},
  pages             = {1781--1793},
  abstract          = {We introduce the first publicly available manually annotated dataset for morphological segmentation and word-formation analysis for Slovene, containing 1,935 words annotated by two domain experts. The dataset provides three types of linguistic information: morphological and word-formation segments with zero-morpheme and simplex annotations. We present a four-stage annotation approach achieving inter-annotator agreement of 86.80{\%} Krippendorff{'}s Alpha for morphological segmentation and 85.16{\%} for word-formation segments. Computational validation using a morphological segmentation model achieves 87.78{\%} BPR F1 on morphological segmentation and 83.05{\%} on word-formation segments. Despite being smaller than previous datasets derived from non-public resources, our dataset enables high performance and supports reproducible research for morphological analysis tools for Slovene.},
}
Markdown (Informal)
[Slovene Morphological and Word Formation Segmentation: A Novel Dataset and Evaluation](https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.140/) (Pranjić et al., LREC 2026)
ACL