% Lane & Bird, ACL 2020 main-conference paper (see booktitle/doi below).
% The url field uses the canonical ACL Anthology permalink rather than a
% preview/staging build; the doi remains the preferred stable identifier.
@inproceedings{lane-bird-2020-bootstrapping,
  title     = {Bootstrapping Techniques for Polysynthetic Morphological Analysis},
  author    = {Lane, William and Bird, Steven},
  editor    = {Jurafsky, Dan and Chai, Joyce and Schluter, Natalie and Tetreault, Joel},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
  month     = jul,
  year      = {2020},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2020.acl-main.594/},
  doi       = {10.18653/v1/2020.acl-main.594},
  pages     = {6652--6661},
  abstract  = {Polysynthetic languages have exceptionally large and sparse vocabularies, thanks to the number of morpheme slots and combinations in a word. This complexity, together with a general scarcity of written data, poses a challenge to the development of natural language technologies. To address this challenge, we offer linguistically-informed approaches for bootstrapping a neural morphological analyzer, and demonstrate its application to Kunwinjku, a polysynthetic Australian language. We generate data from a finite state transducer to train an encoder-decoder model. We improve the model by ``hallucinating'' missing linguistic structure into the training data, and by resampling from a Zipf distribution to simulate a more natural distribution of morphemes. The best model accounts for all instances of reduplication in the test set and achieves an accuracy of 94.7{\%} overall, a 10 percentage point improvement over the FST baseline. This process demonstrates the feasibility of bootstrapping a neural morph analyzer from minimal resources.},
}
Markdown (Informal)
[Bootstrapping Techniques for Polysynthetic Morphological Analysis](https://aclanthology.org/2020.acl-main.594/) (Lane & Bird, ACL 2020)
ACL