@inproceedings{kratochvil-etal-2020-large,
title = "Large Corpus of {C}zech Parliament Plenary Hearings",
author = "Kratochvil, Jonas and
Polak, Peter and
Bojar, Ondrej",
booktitle = "Proceedings of the 12th Language Resources and Evaluation Conference",
month = may,
year = "2020",
address = "Marseille, France",
publisher = "European Language Resources Association",
url = "https://aclanthology.org/2020.lrec-1.781",
pages = "6363--6367",
abstract = "We present a large corpus of Czech parliament plenary sessions. The corpus consists of approximately 1200 hours of speech data and corresponding text transcriptions. The whole corpus has been segmented to short audio segments making it suitable for both training and evaluation of automatic speech recognition (ASR) systems. The source language of the corpus is Czech, which makes it a valuable resource for future research as only a few public datasets are available in the Czech language. We complement the data release with experiments of two baseline ASR systems trained on the presented data: the more traditional approach implemented in the Kaldi ASRtoolkit which combines hidden Markov models and deep neural networks (NN) and a modern ASR architecture implemented in Jaspertoolkit which uses deep NNs in an end-to-end fashion.",
language = "English",
ISBN = "979-10-95546-34-4",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="kratochvil-etal-2020-large">
<titleInfo>
<title>Large Corpus of Czech Parliament Plenary Hearings</title>
</titleInfo>
<name type="personal">
<namePart type="given">Jonas</namePart>
<namePart type="family">Kratochvil</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Peter</namePart>
<namePart type="family">Polak</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Ondrej</namePart>
<namePart type="family">Bojar</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-may</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<language>
<languageTerm type="text">English</languageTerm>
<languageTerm type="code" authority="iso639-2b">eng</languageTerm>
</language>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 12th Language Resources and Evaluation Conference</title>
</titleInfo>
<originInfo>
<publisher>European Language Resources Association</publisher>
<place>
<placeTerm type="text">Marseille, France</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
<identifier type="isbn">979-10-95546-34-4</identifier>
</relatedItem>
<abstract>We present a large corpus of Czech parliament plenary sessions. The corpus consists of approximately 1200 hours of speech data and corresponding text transcriptions. The whole corpus has been segmented to short audio segments making it suitable for both training and evaluation of automatic speech recognition (ASR) systems. The source language of the corpus is Czech, which makes it a valuable resource for future research as only a few public datasets are available in the Czech language. We complement the data release with experiments of two baseline ASR systems trained on the presented data: the more traditional approach implemented in the Kaldi ASRtoolkit which combines hidden Markov models and deep neural networks (NN) and a modern ASR architecture implemented in Jaspertoolkit which uses deep NNs in an end-to-end fashion.</abstract>
<identifier type="citekey">kratochvil-etal-2020-large</identifier>
<location>
<url>https://aclanthology.org/2020.lrec-1.781</url>
</location>
<part>
<date>2020-may</date>
<extent unit="page">
<start>6363</start>
<end>6367</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Large Corpus of Czech Parliament Plenary Hearings
%A Kratochvil, Jonas
%A Polak, Peter
%A Bojar, Ondrej
%S Proceedings of the 12th Language Resources and Evaluation Conference
%D 2020
%8 may
%I European Language Resources Association
%C Marseille, France
%@ 979-10-95546-34-4
%G English
%F kratochvil-etal-2020-large
%X We present a large corpus of Czech parliament plenary sessions. The corpus consists of approximately 1200 hours of speech data and corresponding text transcriptions. The whole corpus has been segmented to short audio segments making it suitable for both training and evaluation of automatic speech recognition (ASR) systems. The source language of the corpus is Czech, which makes it a valuable resource for future research as only a few public datasets are available in the Czech language. We complement the data release with experiments of two baseline ASR systems trained on the presented data: the more traditional approach implemented in the Kaldi ASRtoolkit which combines hidden Markov models and deep neural networks (NN) and a modern ASR architecture implemented in Jaspertoolkit which uses deep NNs in an end-to-end fashion.
%U https://aclanthology.org/2020.lrec-1.781
%P 6363-6367
Markdown (Informal)
[Large Corpus of Czech Parliament Plenary Hearings](https://aclanthology.org/2020.lrec-1.781) (Kratochvil et al., LREC 2020)
ACL
- Jonas Kratochvil, Peter Polak, and Ondrej Bojar. 2020. Large Corpus of Czech Parliament Plenary Hearings. In Proceedings of the 12th Language Resources and Evaluation Conference, pages 6363–6367, Marseille, France. European Language Resources Association.