@inproceedings{saleva-2020-multi,
title = "A Multi-Orthography Parallel Corpus of {Y}iddish Nouns",
author = "Saleva, Jonne",
booktitle = "Proceedings of the 12th Language Resources and Evaluation Conference",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.lrec-1.119",
pages = "948--952",
abstract = "Yiddish is a low-resource language belonging to the Germanic language family and written using the Hebrew alphabet. As a language, Yiddish can be considered resource-poor as it lacks both public accessible corpora and a widely-used standard orthography, with various countries and organizations influencing the spellings speakers use. While existing corpora of Yiddish text do exist, they are often only written in a single, potentially non-standard orthography, with no parallel version with standard orthography available. In this work, we introduce the first multi-orthography parallel corpus of Yiddish nouns built by scraping word entries from Wiktionary. We also demonstrate how the corpus can be used to bootstrap a transliteration model using the Sequitur-G2P grapheme-to-phoneme conversion toolkit to map between various orthographies. Our trained system achieves error rates between 16.79{\%} and 28.47{\%} on the test set, depending on the orthographies considered. In addition to quantitative analysis, we also conduct qualitative error analysis of the trained system, concluding that non-phonetically spelled Hebrew words are the largest cause of error. We conclude with remarks regarding future work and release the corpus and associated code under a permissive license for the larger community to use.",
language = "English",
ISBN = "979-10-95546-34-4",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="saleva-2020-multi">
<titleInfo>
<title>A Multi-Orthography Parallel Corpus of Yiddish Nouns</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jonne</namePart>
<namePart type="family">Saleva</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-may</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 12th Language Resources and Evaluation Conference</title>
</titleInfo>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-34-4</identifier>
</relatedItem>
<abstract>Yiddish is a low-resource language belonging to the Germanic language family and written using the Hebrew alphabet. As a language, Yiddish can be considered resource-poor as it lacks both public accessible corpora and a widely-used standard orthography, with various countries and organizations influencing the spellings speakers use. While existing corpora of Yiddish text do exist, they are often only written in a single, potentially non-standard orthography, with no parallel version with standard orthography available. In this work, we introduce the first multi-orthography parallel corpus of Yiddish nouns built by scraping word entries from Wiktionary. We also demonstrate how the corpus can be used to bootstrap a transliteration model using the Sequitur-G2P grapheme-to-phoneme conversion toolkit to map between various orthographies. Our trained system achieves error rates between 16.79% and 28.47% on the test set, depending on the orthographies considered. In addition to quantitative analysis, we also conduct qualitative error analysis of the trained system, concluding that non-phonetically spelled Hebrew words are the largest cause of error. We conclude with remarks regarding future work and release the corpus and associated code under a permissive license for the larger community to use.</abstract>
<identifier type="citekey">saleva-2020-multi</identifier>
<location>
<url>https://aclanthology.org/2020.lrec-1.119</url>
</location>
<part>
<date>2020-may</date>
<extent unit="page">
<start>948</start>
<end>952</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A Multi-Orthography Parallel Corpus of Yiddish Nouns
%A Saleva, Jonne
%S Proceedings of the 12th Language Resources and Evaluation Conference
%D 2020
%8 may
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-34-4
%G English
%F saleva-2020-multi
%X Yiddish is a low-resource language belonging to the Germanic language family and written using the Hebrew alphabet. As a language, Yiddish can be considered resource-poor as it lacks both public accessible corpora and a widely-used standard orthography, with various countries and organizations influencing the spellings speakers use. While existing corpora of Yiddish text do exist, they are often only written in a single, potentially non-standard orthography, with no parallel version with standard orthography available. In this work, we introduce the first multi-orthography parallel corpus of Yiddish nouns built by scraping word entries from Wiktionary. We also demonstrate how the corpus can be used to bootstrap a transliteration model using the Sequitur-G2P grapheme-to-phoneme conversion toolkit to map between various orthographies. Our trained system achieves error rates between 16.79% and 28.47% on the test set, depending on the orthographies considered. In addition to quantitative analysis, we also conduct qualitative error analysis of the trained system, concluding that non-phonetically spelled Hebrew words are the largest cause of error. We conclude with remarks regarding future work and release the corpus and associated code under a permissive license for the larger community to use.
%U https://aclanthology.org/2020.lrec-1.119
%P 948-952
Markdown (Informal)
[A Multi-Orthography Parallel Corpus of Yiddish Nouns](https://aclanthology.org/2020.lrec-1.119) (Saleva, LREC 2020)
ACL