% Mager, Cetinoglu & Kann (2020), EMNLP proceedings paper.
% ACL Anthology identifier (as it appears in the DOI and URL): 2020.emnlp-main.423.
% The url field uses the canonical aclanthology.org host, not a preview-branch host.
@inproceedings{mager-etal-2020-tackling,
  title     = {Tackling the Low-resource Challenge for Canonical Segmentation},
  author    = {Mager, Manuel and
               {\c{C}}etino{\u{g}}lu, {\"O}zlem and
               Kann, Katharina},
  editor    = {Webber, Bonnie and
               Cohn, Trevor and
               He, Yulan and
               Liu, Yang},
  booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing ({EMNLP})},
  month     = nov,
  year      = {2020},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2020.emnlp-main.423/},
  doi       = {10.18653/v1/2020.emnlp-main.423},
  pages     = {5237--5250},
  abstract  = {Canonical morphological segmentation consists of dividing words into their standardized morphemes. Here, we are interested in approaches for the task when training data is limited. We compare model performance in a simulated low-resource setting for the high-resource languages German, English, and Indonesian to experiments on new datasets for the truly low-resource languages Popoluca and Tepehua. We explore two new models for the task, borrowing from the closely related area of morphological generation: an LSTM pointer-generator and a sequence-to-sequence model with hard monotonic attention trained with imitation learning. We find that, in the low-resource setting, the novel approaches out-perform existing ones on all languages by up to 11.4{\%} accuracy. However, while accuracy in emulated low-resource scenarios is over 50{\%} for all languages, for the truly low-resource languages Popoluca and Tepehua, our best model only obtains 37.4{\%} and 28.4{\%} accuracy, respectively. Thus, we conclude that canonical segmentation is still a challenging task for low-resource languages.},
}
Markdown (Informal)
[Tackling the Low-resource Challenge for Canonical Segmentation](https://aclanthology.org/2020.emnlp-main.423/) (Mager et al., EMNLP 2020)
ACL