% Conference paper (NoDaLiDa 2021). The month field concatenates the
% predefined macros may/jun with literal day numbers so that styles render
% the range as "May 31 -- June 2". NOTE(review): the {\dh} (eth) macro is
% normally only available under T1 font encoding -- confirm the preamble.
@inproceedings{sigurdardottir-etal-2021-creating,
    title = "Creating Data in {Icelandic} for Text Normalization",
    author = {Sigur{\dh}ard{\'o}ttir, Helga Svala and
      Nikul{\'a}sd{\'o}ttir, Anna Bj{\"o}rk and
      Gu{\dh}nason, J{\'o}n},
    booktitle = "Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa)",
    month = may # " 31 -- " # jun # " 2",
    year = "2021",
    address = "Reykjavik, Iceland (Online)",
    publisher = {Link{\"o}ping University Electronic Press, Sweden},
    url = "https://aclanthology.org/2021.nodalida-main.45",
    pages = "404--412",
    abstract = "There is no natural way to acquire normalized data so we try to create good enough data to attempt more advanced methods for text normalization. We manually annotated the first normalized corpus in Icelandic, 40,000 sentences, and developed Reg{\'\i}na, a rule-based system for text normalization. Reg{\'\i}na gets 90.83{\%} accuracy compared to the manually annotated corpus on non-standard words. Reg{\'\i}na showed a significant improvement in accuracy when compared to an older normalization system for Icelandic. The normalized corpus and Reg{\'\i}na will be released as open source.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="sigurdardottir-etal-2021-creating">
<titleInfo>
<title>Creating Data in Icelandic for Text Normalization</title>
</titleInfo>
<name type="personal">
<namePart type="given">Helga</namePart>
<namePart type="given">Svala</namePart>
<namePart type="family">Sigurðardóttir</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Anna</namePart>
<namePart type="given">Björk</namePart>
<namePart type="family">Nikulásdóttir</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Jón</namePart>
<namePart type="family">Guðnason</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2021-05-31/2021-06-02</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa)</title>
</titleInfo>
<originInfo>
<publisher>Linköping University Electronic Press, Sweden</publisher>
<place>
<placeTerm type="text">Reykjavik, Iceland (Online)</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>There is no natural way to acquire normalized data so we try to create good enough data to attempt more advanced methods for text normalization. We manually annotated the first normalized corpus in Icelandic, 40,000 sentences, and developed Regína, a rule-based system for text normalization. Regína gets 90.83% accuracy compared to the manually annotated corpus on non-standard words. Regína showed a significant improvement in accuracy when compared to an older normalization system for Icelandic. The normalized corpus and Regína will be released as open source.</abstract>
<identifier type="citekey">sigurdardottir-etal-2021-creating</identifier>
<location>
<url>https://aclanthology.org/2021.nodalida-main.45</url>
</location>
<part>
<date>2021-05-31/2021-06-02</date>
<extent unit="page">
<start>404</start>
<end>412</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Creating Data in Icelandic for Text Normalization
%A Sigurðardóttir, Helga Svala
%A Nikulásdóttir, Anna Björk
%A Guðnason, Jón
%S Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa)
%D 2021
%8 May 31–June 2
%I Linköping University Electronic Press, Sweden
%C Reykjavik, Iceland (Online)
%F sigurdardottir-etal-2021-creating
%X There is no natural way to acquire normalized data so we try to create good enough data to attempt more advanced methods for text normalization. We manually annotated the first normalized corpus in Icelandic, 40,000 sentences, and developed Regína, a rule-based system for text normalization. Regína gets 90.83% accuracy compared to the manually annotated corpus on non-standard words. Regína showed a significant improvement in accuracy when compared to an older normalization system for Icelandic. The normalized corpus and Regína will be released as open source.
%U https://aclanthology.org/2021.nodalida-main.45
%P 404-412
Markdown (Informal)
[Creating Data in Icelandic for Text Normalization](https://aclanthology.org/2021.nodalida-main.45) (Sigurðardóttir et al., NoDaLiDa 2021)
ACL
- Helga Svala Sigurðardóttir, Anna Björk Nikulásdóttir, and Jón Guðnason. 2021. Creating Data in Icelandic for Text Normalization. In Proceedings of the 23rd Nordic Conference on Computational Linguistics (NoDaLiDa), pages 404–412, Reykjavik, Iceland (Online). Linköping University Electronic Press, Sweden.