@inproceedings{zhao-etal-2023-ccsrd,
title = "{CCSRD}: Content-Centric Speech Representation Disentanglement Learning for End-to-End Speech Translation",
author = "Zhao, Xiaohu and
Sun, Haoran and
Lei, Yikun and
Zhu, Shaolin and
Xiong, Deyi",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2023.findings-emnlp.394/",
doi = "10.18653/v1/2023.findings-emnlp.394",
pages = "5920--5932",
abstract = "Deep neural networks have demonstrated their capacity in extracting features from speech inputs. However, these features may include non-linguistic speech factors such as timbre and speaker identity, which are not directly related to translation. In this paper, we propose a content-centric speech representation disentanglement learning framework for speech translation, CCSRD, which decomposes speech representations into content representations and non-linguistic representations via representation disentanglement learning. CCSRD consists of a content encoder that encodes linguistic content information from the speech input, a non-content encoder that models non-linguistic speech features, and a disentanglement module that learns disentangled representations with a cyclic reconstructor, feature reconstructor and speaker classifier trained in a multi-task learning way. Experiments on the MuST-C benchmark dataset demonstrate that CCSRD achieves an average improvement of +0.9 BLEU in two settings across five translation directions over the baseline, outperforming state-of-the-art end-to-end speech translation models and cascaded models."
}
Markdown (Informal)
[CCSRD: Content-Centric Speech Representation Disentanglement Learning for End-to-End Speech Translation](https://aclanthology.org/2023.findings-emnlp.394/) (Zhao et al., Findings 2023)
ACL