@inproceedings{yuan-etal-2022-data,
    title = "Data Augmentation for the Post-Stroke Speech Transcription ({PSST}) Challenge: Sometimes Less Is More",
    author = "Yuan, Jiahong and
      Cai, Xingyu and
      Church, Kenneth",
    editor = "Kokkinakis, Dimitrios and
      Themistocleous, Charalambos K. and
      Fors, Kristina Lundholm and
      Tsanas, Athanasios and
      Fraser, Kathleen C.",
    booktitle = "Proceedings of the RaPID Workshop - Resources and ProcessIng of linguistic, para-linguistic and extra-linguistic Data from people with various forms of cognitive/psychiatric/developmental impairments - within the 13th Language Resources and Evaluation Conference",
    month = jun,
    year = "2022",
    address = "Marseille, France",
    publisher = "European Language Resources Association",
    url = "https://aclanthology.org/2022.rapid-1.9/",
    pages = "71--79",
    abstract = "We employ the method of fine-tuning wav2vec2.0 for recognition of phonemes in aphasic speech. Our effort focuses on data augmentation, by supplementing data from both in-domain and out-of-domain datasets for training. We found that although a modest amount of out-of-domain data may be helpful, the performance of the model degrades significantly when the amount of out-of-domain data is much larger than in-domain data. Our hypothesis is that fine-tuning wav2vec2.0 with a CTC loss not only learns bottom-up acoustic properties but also top-down constraints. Therefore, out-of-domain data augmentation is likely to degrade performance if there is a language model mismatch between {\textquotedblleft}in{\textquotedblright} and {\textquotedblleft}out{\textquotedblright} domains. For in-domain audio without ground truth labels, we found that it is beneficial to exclude samples with less confident pseudo labels. Our final model achieves 16.7{\%} PER (phoneme error rate) on the validation set, without using a language model for decoding. The result represents a relative error reduction of 14{\%} over the baseline model trained without data augmentation. Finally, we found that {\textquotedblleft}canonicalized{\textquotedblright} phonemes are much easier to recognize than manually transcribed phonemes.",
    internal-note = {url normalized from ephemeral preview.aclanthology.org staging link to the canonical Anthology URL}
}
Markdown (Informal)
[Data Augmentation for the Post-Stroke Speech Transcription (PSST) Challenge: Sometimes Less Is More](https://aclanthology.org/2022.rapid-1.9/) (Yuan et al., RaPID 2022)
ACL