% Conference paper entry. Proper nouns in the title are brace-protected as
% whole words so that sentence-casing styles keep their capitals without
% splitting a word across a brace group.
@inproceedings{abate-etal-2020-large-vocabulary,
  title     = {Large Vocabulary Read Speech Corpora for Four {Ethiopian} Languages: {Amharic}, {Tigrigna}, {Oromo} and {Wolaytta}},
  author    = {Abate, Solomon Teferra and
               Tachbelie, Martha Yifiru and
               Melese, Michael and
               Abera, Hafte and
               Abebe, Tewodros and
               Mulugeta, Wondwossen and
               Assabie, Yaregal and
               Meshesha, Million and
               Afnafu, Solomon and
               Seyoum, Binyam Ephrem},
  booktitle = {Proceedings of the 12th Language Resources and Evaluation Conference},
  month     = may,
  year      = {2020},
  address   = {Marseille, France},
  publisher = {European Language Resources Association},
  url       = {https://aclanthology.org/2020.lrec-1.513},
  pages     = {4167--4171},
  abstract  = {Automatic Speech Recognition (ASR) is one of the most important technologies to support spoken communication in modern life. However, its development benefits from large speech corpus. The development of such a corpus is expensive and most of the human languages, including the Ethiopian languages, do not have such resources. To address this problem, we have developed four large (about 22 hours) speech corpora for four Ethiopian languages: Amharic, Tigrigna, Oromo and Wolaytta. To assess usability of the corpora for (the purpose of) speech processing, we have developed ASR systems for each language. In this paper, we present the corpora and the baseline ASR systems we have developed. We have achieved word error rates (WERs) of 37.65{\%}, 31.03{\%}, 38.02{\%}, 33.89{\%} for Amharic, Tigrigna, Oromo and Wolaytta, respectively. This results show that the corpora are suitable for further investigation towards the development of ASR systems. Thus, the research community can use the corpora to further improve speech processing systems. From our results, it is clear that the collection of text corpora to train strong language models for all of the languages is still required, especially for Oromo and Wolaytta.},
  language  = {English},
  isbn      = {979-10-95546-34-4},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record for a conference paper. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="abate-etal-2020-large-vocabulary">
    <titleInfo>
      <title>Large Vocabulary Read Speech Corpora for Four Ethiopian Languages: Amharic, Tigrigna, Oromo and Wolaytta</title>
    </titleInfo>
    <!-- Authors, in publication order. -->
    <name type="personal">
      <namePart type="given">Solomon</namePart>
      <namePart type="given">Teferra</namePart>
      <namePart type="family">Abate</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Martha</namePart>
      <namePart type="given">Yifiru</namePart>
      <namePart type="family">Tachbelie</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Michael</namePart>
      <namePart type="family">Melese</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Hafte</namePart>
      <namePart type="family">Abera</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Tewodros</namePart>
      <namePart type="family">Abebe</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Wondwossen</namePart>
      <namePart type="family">Mulugeta</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Yaregal</namePart>
      <namePart type="family">Assabie</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Million</namePart>
      <namePart type="family">Meshesha</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Solomon</namePart>
      <namePart type="family">Afnafu</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Binyam</namePart>
      <namePart type="given">Ephrem</namePart>
      <namePart type="family">Seyoum</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <!-- Numeric YYYY-MM form (May 2020) so date parsers can read the month. -->
      <dateIssued>2020-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <language>
      <languageTerm type="text">English</languageTerm>
      <languageTerm type="code" authority="iso639-2b">eng</languageTerm>
    </language>
    <!-- Host item: the proceedings volume this paper appears in. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 12th Language Resources and Evaluation Conference</title>
      </titleInfo>
      <originInfo>
        <publisher>European Language Resources Association</publisher>
        <place>
          <placeTerm type="text">Marseille, France</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
      <identifier type="isbn">979-10-95546-34-4</identifier>
    </relatedItem>
    <abstract>Automatic Speech Recognition (ASR) is one of the most important technologies to support spoken communication in modern life. However, its development benefits from large speech corpus. The development of such a corpus is expensive and most of the human languages, including the Ethiopian languages, do not have such resources. To address this problem, we have developed four large (about 22 hours) speech corpora for four Ethiopian languages: Amharic, Tigrigna, Oromo and Wolaytta. To assess usability of the corpora for (the purpose of) speech processing, we have developed ASR systems for each language. In this paper, we present the corpora and the baseline ASR systems we have developed. We have achieved word error rates (WERs) of 37.65%, 31.03%, 38.02%, 33.89% for Amharic, Tigrigna, Oromo and Wolaytta, respectively. This results show that the corpora are suitable for further investigation towards the development of ASR systems. Thus, the research community can use the corpora to further improve speech processing systems. From our results, it is clear that the collection of text corpora to train strong language models for all of the languages is still required, especially for Oromo and Wolaytta.</abstract>
    <identifier type="citekey">abate-etal-2020-large-vocabulary</identifier>
    <location>
      <url>https://aclanthology.org/2020.lrec-1.513</url>
    </location>
    <part>
      <!-- Same issue date as originInfo/dateIssued, in the same numeric form. -->
      <date>2020-05</date>
      <extent unit="page">
        <start>4167</start>
        <end>4171</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Large Vocabulary Read Speech Corpora for Four Ethiopian Languages: Amharic, Tigrigna, Oromo and Wolaytta
%A Abate, Solomon Teferra
%A Tachbelie, Martha Yifiru
%A Melese, Michael
%A Abera, Hafte
%A Abebe, Tewodros
%A Mulugeta, Wondwossen
%A Assabie, Yaregal
%A Meshesha, Million
%A Afnafu, Solomon
%A Seyoum, Binyam Ephrem
%S Proceedings of the 12th Language Resources and Evaluation Conference
%D 2020
%8 May
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-34-4
%G English
%F abate-etal-2020-large-vocabulary
%X Automatic Speech Recognition (ASR) is one of the most important technologies to support spoken communication in modern life. However, its development benefits from large speech corpus. The development of such a corpus is expensive and most of the human languages, including the Ethiopian languages, do not have such resources. To address this problem, we have developed four large (about 22 hours) speech corpora for four Ethiopian languages: Amharic, Tigrigna, Oromo and Wolaytta. To assess usability of the corpora for (the purpose of) speech processing, we have developed ASR systems for each language. In this paper, we present the corpora and the baseline ASR systems we have developed. We have achieved word error rates (WERs) of 37.65%, 31.03%, 38.02%, 33.89% for Amharic, Tigrigna, Oromo and Wolaytta, respectively. This results show that the corpora are suitable for further investigation towards the development of ASR systems. Thus, the research community can use the corpora to further improve speech processing systems. From our results, it is clear that the collection of text corpora to train strong language models for all of the languages is still required, especially for Oromo and Wolaytta.
%U https://aclanthology.org/2020.lrec-1.513
%P 4167-4171
Markdown (Informal)
[Large Vocabulary Read Speech Corpora for Four Ethiopian Languages: Amharic, Tigrigna, Oromo and Wolaytta](https://aclanthology.org/2020.lrec-1.513) (Abate et al., LREC 2020)
ACL
- Solomon Teferra Abate, Martha Yifiru Tachbelie, Michael Melese, Hafte Abera, Tewodros Abebe, Wondwossen Mulugeta, Yaregal Assabie, Million Meshesha, Solomon Afnafu, and Binyam Ephrem Seyoum. 2020. Large Vocabulary Read Speech Corpora for Four Ethiopian Languages: Amharic, Tigrigna, Oromo and Wolaytta. In Proceedings of the 12th Language Resources and Evaluation Conference, pages 4167–4171, Marseille, France. European Language Resources Association.