@inproceedings{dadason-etal-2020-kvistur,
title = "Kvistur 2.0: a {B}i{LSTM} Compound Splitter for {I}celandic",
author = "Da{\dh}ason, J{\'o}n and
Mollberg, David and
Loftsson, Hrafn and
Bjarnad{\'o}ttir, Krist{\'\i}n",
booktitle = "Proceedings of the 12th Language Resources and Evaluation Conference",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.lrec-1.492",
pages = "3991--3995",
abstract = "In this paper, we present a character-based BiLSTM model for splitting Icelandic compound words, and show how varying amounts of training data affects the performance of the model. Compounding is highly productive in Icelandic, and new compounds are constantly being created. This results in a large number of out-of-vocabulary (OOV) words, negatively impacting the performance of many NLP tools. Our model is trained on a dataset of 2.9 million unique word forms and their constituent structures from the Database of Icelandic Morphology. The model learns how to split compound words into two parts and can be used to derive the constituent structure of any word form. Knowing the constituent structure of a word form makes it possible to generate the optimal split for a given task, e.g., a full split for subword tokenization, or, in the case of part-of-speech tagging, splitting an OOV word until the largest known morphological head is found. The model outperforms other previously published methods when evaluated on a corpus of manually split word forms. This method has been integrated into Kvistur, an Icelandic compound word analyzer.",
language = "English",
ISBN = "979-10-95546-34-4",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="dadason-etal-2020-kvistur">
<titleInfo>
<title>Kvistur 2.0: a BiLSTM Compound Splitter for Icelandic</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jón</namePart>
<namePart type="family">Da\dhason</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">David</namePart>
<namePart type="family">Mollberg</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Hrafn</namePart>
<namePart type="family">Loftsson</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Kristín</namePart>
<namePart type="family">Bjarnadóttir</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-may</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 12th Language Resources and Evaluation Conference</title>
</titleInfo>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-34-4</identifier>
</relatedItem>
<abstract>In this paper, we present a character-based BiLSTM model for splitting Icelandic compound words, and show how varying amounts of training data affects the performance of the model. Compounding is highly productive in Icelandic, and new compounds are constantly being created. This results in a large number of out-of-vocabulary (OOV) words, negatively impacting the performance of many NLP tools. Our model is trained on a dataset of 2.9 million unique word forms and their constituent structures from the Database of Icelandic Morphology. The model learns how to split compound words into two parts and can be used to derive the constituent structure of any word form. Knowing the constituent structure of a word form makes it possible to generate the optimal split for a given task, e.g., a full split for subword tokenization, or, in the case of part-of-speech tagging, splitting an OOV word until the largest known morphological head is found. The model outperforms other previously published methods when evaluated on a corpus of manually split word forms. This method has been integrated into Kvistur, an Icelandic compound word analyzer.</abstract>
<identifier type="citekey">dadason-etal-2020-kvistur</identifier>
<location>
<url>https://aclanthology.org/2020.lrec-1.492</url>
</location>
<part>
<date>2020-may</date>
<extent unit="page">
<start>3991</start>
<end>3995</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Kvistur 2.0: a BiLSTM Compound Splitter for Icelandic
%A Da\dhason, Jón
%A Mollberg, David
%A Loftsson, Hrafn
%A Bjarnadóttir, Kristín
%S Proceedings of the 12th Language Resources and Evaluation Conference
%D 2020
%8 may
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-34-4
%G English
%F dadason-etal-2020-kvistur
%X In this paper, we present a character-based BiLSTM model for splitting Icelandic compound words, and show how varying amounts of training data affects the performance of the model. Compounding is highly productive in Icelandic, and new compounds are constantly being created. This results in a large number of out-of-vocabulary (OOV) words, negatively impacting the performance of many NLP tools. Our model is trained on a dataset of 2.9 million unique word forms and their constituent structures from the Database of Icelandic Morphology. The model learns how to split compound words into two parts and can be used to derive the constituent structure of any word form. Knowing the constituent structure of a word form makes it possible to generate the optimal split for a given task, e.g., a full split for subword tokenization, or, in the case of part-of-speech tagging, splitting an OOV word until the largest known morphological head is found. The model outperforms other previously published methods when evaluated on a corpus of manually split word forms. This method has been integrated into Kvistur, an Icelandic compound word analyzer.
%U https://aclanthology.org/2020.lrec-1.492
%P 3991-3995
Markdown (Informal)
[Kvistur 2.0: a BiLSTM Compound Splitter for Icelandic](https://aclanthology.org/2020.lrec-1.492) (Daðason et al., LREC 2020)
ACL
- Jón Daðason, David Mollberg, Hrafn Loftsson, and Kristín Bjarnadóttir. 2020. Kvistur 2.0: a BiLSTM Compound Splitter for Icelandic. In Proceedings of the 12th Language Resources and Evaluation Conference, pages 3991–3995, Marseille, France. European Language Resources Association.