@comment{ACL Anthology record D18-2016 (EMNLP 2018 System Demonstrations). The url field is the canonical Anthology page; the system name and YouTube are brace-protected as whole words so sentence-casing styles keep their capitalisation.}
@inproceedings{lakomkin-etal-2018-kt,
  title     = {{KT-Speech-Crawler}: Automatic Dataset Construction for Speech Recognition from {YouTube} Videos},
  author    = {Lakomkin, Egor and
               Magg, Sven and
               Weber, Cornelius and
               Wermter, Stefan},
  editor    = {Blanco, Eduardo and
               Lu, Wei},
  booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing: System Demonstrations},
  month     = nov,
  year      = {2018},
  address   = {Brussels, Belgium},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/D18-2016/},
  doi       = {10.18653/v1/D18-2016},
  pages     = {90--95},
  abstract  = {We describe KT-Speech-Crawler: an approach for automatic dataset construction for speech recognition by crawling YouTube videos. We outline several filtering and post-processing steps, which extract samples that can be used for training end-to-end neural speech recognition systems. In our experiments, we demonstrate that a single-core version of the crawler can obtain around 150 hours of transcribed speech within a day, containing an estimated 3.5{\%} word error rate in the transcriptions. Automatically collected samples contain reading and spontaneous speech recorded in various conditions including background noise and music, distant microphone recordings, and a variety of accents and reverberation. When training a deep neural network on speech recognition, we observed around 40{\%} word error rate reduction on the Wall Street Journal dataset by integrating 200 hours of the collected samples into the training set.},
}
Markdown (Informal)
[KT-Speech-Crawler: Automatic Dataset Construction for Speech Recognition from YouTube Videos](https://aclanthology.org/D18-2016/) (Lakomkin et al., EMNLP 2018)
ACL