@inproceedings{pranjic-etal-2024-llmsegm,
title = "{LLMS}egm: Surface-level Morphological Segmentation Using Large Language Model",
author = "Pranji{\'c}, Marko and
Robnik-{\v{S}}ikonja, Marko and
Pollak, Senja",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://preview.aclanthology.org/fix-sig-urls/2024.lrec-main.933/",
pages = "10665--10674",
abstract = "Morphological word segmentation splits a given word into its morphemes (roots and affixes), the smallest meaning-bearing units of language. We introduce a novel approach, called LLMSegm, to surface-level morphological segmentation leveraging large language models (LLMs). The proposed approach is applicable in low-data settings as well as for low-resourced languages. We show how to transform the surface-level morphological segmentation task to a binary classification problem and train LLMs to solve it efficiently. For input, we leverage the information from the default LLM subword tokenisation, and a custom morphological segmentation using novel encoding. The evaluation of LLMSegm across seven morphologically diverse languages demonstrates substantial gains in minimally-supervised settings as well as for low-resourced languages, compared to several existing competitive approaches. In terms of F1-scores and accuracy, we achieve improved results compared to the competing methods in six out of seven datasets. Keywords: morphological segmentation, surface-level segmentation, large language models, low-resource settings"
}
Markdown (Informal)
[LLMSegm: Surface-level Morphological Segmentation Using Large Language Model](https://preview.aclanthology.org/fix-sig-urls/2024.lrec-main.933/) (Pranjić et al., LREC-COLING 2024)
ACL