@inproceedings{ohta-etal-2022-joeys2t,
  % Brace-delimited values (nest safely, unlike quotes); whole-word brace
  % protection for the camelCase tool names so styles cannot downcase them.
  title     = {{JoeyS2T}: Minimalistic Speech-to-Text Modeling with {JoeyNMT}},
  author    = {Ohta, Mayumi and
               Kreutzer, Julia and
               Riezler, Stefan},
  editor    = {Che, Wanxiang and
               Shutova, Ekaterina},
  booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: System Demonstrations},
  month     = dec,
  year      = {2022},
  address   = {Abu Dhabi, UAE},
  publisher = {Association for Computational Linguistics},
  % Canonical Anthology URL; the original pointed at a temporary preview
  % mirror (preview.aclanthology.org/moar-dois/...) that is not stable.
  url       = {https://aclanthology.org/2022.emnlp-demos.6/},
  doi       = {10.18653/v1/2022.emnlp-demos.6},
  pages     = {50--59},
  abstract  = {JoeyS2T is a JoeyNMT extension for speech-to-text tasks such as automatic speech recognition and end-to-end speech translation. It inherits the core philosophy of JoeyNMT, a minimalist NMT toolkit built on PyTorch, seeking simplicity and accessibility. JoeyS2T{'}s workflow is self-contained, starting from data pre-processing, over model training and prediction to evaluation, and is seamlessly integrated into JoeyNMT{'}s compact and simple code base. On top of JoeyNMT{'}s state-of-the-art Transformer-based Encoder-Decoder architecture, JoeyS2T provides speech-oriented components such as convolutional layers, SpecAugment, CTC-loss, and WER evaluation. Despite its simplicity compared to prior implementations, JoeyS2T performs competitively on English speech recognition and English-to-German speech translation benchmarks. The implementation is accompanied by a walk-through tutorial and available on \url{https://github.com/may-/joeys2t}.},
}
Markdown (Informal)
[JoeyS2T: Minimalistic Speech-to-Text Modeling with JoeyNMT](https://aclanthology.org/2022.emnlp-demos.6/) (Ohta et al., EMNLP 2022)
ACL
- Mayumi Ohta, Julia Kreutzer, and Stefan Riezler. 2022. JoeyS2T: Minimalistic Speech-to-Text Modeling with JoeyNMT. In Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing: System Demonstrations, pages 50–59, Abu Dhabi, UAE. Association for Computational Linguistics.