% Conference paper (LREC 2026) by Brans and Bloem introducing Dutch Multi-SimLex.
% Typed as inproceedings: the venue is a conference and the record carries an
% editor list, which the article type does not print in the standard styles.
% The value "main" (formerly in the volume field) is not a bound-volume number;
% it appears to be the proceedings track slug (compare "lrec-main" in the url),
% so it is kept in the non-standard anthology-volume field, which styles ignore.
% NOTE(review): the url points at a preview/ingest host; confirm the canonical
% address once the volume is published.
@inproceedings{brans-bloem-2026-multi,
    title = "{Multi-SimLex} for {Dutch}: Benchmarking Embedding- and Prompt-Based Model Performance on Semantic Similarity",
    author = "Brans, Lizzy and
      Bloem, Jelke",
    editor = "Piperidis, Stelios and
      Bel, N{\'u}ria and
      van den Heuvel, Henk and
      Ide, Nancy and
      Krek, Simon and
      Toral, Antonio",
    booktitle = "International Conference on Language Resources and Evaluation",
    anthology-volume = "main",
    month = may,
    year = "2026",
    address = "Palma de Mallorca, Spain",
    publisher = "ELRA Language Resource Association",
    url = "https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.380/",
    pages = "4846--4860",
    abstract = "We introduce Dutch Multi-SimLex, a 1,888{--}pair extension of the Multi-SimLex benchmark for evaluating lexical semantic similarity in Dutch. The dataset was rated by 100 native speakers on a 0{--}6 scale and shows high reliability (overall ICC(2,k)=0.82) as well as strong alignment with English ({\ensuremath{\rho}}=0.73). Using this resource, we evaluate eighteen models across four architectural families: static embeddings, encoder-only transformers, encoder{--}decoders, and decoder-only LLMs. We evaluate models using two complementary approaches: embedding-based cosine similarity and prompted similarity judgments in Dutch. In embedding-based evaluation, FastText ({\ensuremath{\rho}}=0.485) and the monolingual Dutch encoder BERTje ({\ensuremath{\rho}}=0.468) achieve the strongest alignment with human ratings, while multilingual encoders such as mBERT ({\ensuremath{\rho}}=0.208) and XLM-R ({\ensuremath{\rho}}=0.186) perform weaker. Prompt-based evaluation yields substantially higher correlations, with GPT-4 ({\ensuremath{\rho}}=0.761) performing best, followed by DeepSeek-V3 ({\ensuremath{\rho}}=0.753) and Gemini 1.5 Pro ({\ensuremath{\rho}}=0.722). Together, the results show that model performance depends strongly on how meaning is tested. Dutch Multi-SimLex provides a reliable foundation for evaluating meaning across architectures and advancing Dutch semantic evaluation."
}
Markdown (Informal)
[Multi-SimLex for Dutch: Benchmarking Embedding- and Prompt-Based Model Performance on Semantic Similarity](https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.380/) (Brans & Bloem, LREC 2026)
ACL