@inproceedings{han-etal-2023-modality,
title = "Modality Adaption or Regularization? A Case Study on End-to-End Speech Translation",
author = "Han, Yuchen and
Xu, Chen and
Xiao, Tong and
Zhu, Jingbo",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2023.acl-short.115/",
doi = "10.18653/v1/2023.acl-short.115",
pages = "1340--1348",
abstract = "Pre-training and fine-tuning is a paradigm for alleviating the data scarcity problem in end-to-end speech translation (E2E ST). The commonplace ``modality gap'' between speech and text data often leads to inconsistent inputs between pre-training and fine-tuning. However, we observe that this gap occurs in the early stages of fine-tuning, but does not have a major impact on the final performance. On the other hand, we find that there has another gap, which we call the ``capacity gap'': high resource tasks (such as ASR and MT) always require a large model to fit, when the model is reused for a low resource task (E2E ST), it will get a sub-optimal performance due to the over-fitting. In a case study, we find that the regularization plays a more important role than the well-designed modality adaption method, which achieves 29.0 for en-de and 40.3 for en-fr on the MuST-C dataset."
}
Markdown (Informal)
[Modality Adaption or Regularization? A Case Study on End-to-End Speech Translation](https://aclanthology.org/2023.acl-short.115/) (Han et al., ACL 2023)
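A minimal sketch of the regularization idea summarized in the abstract above: reuse a large encoder-decoder pre-trained on high-resource ASR/MT and raise regularization (dropout, label smoothing, weight decay) before fine-tuning on the low-resource ST data. The stand-in model, hyperparameter values, and toy training step below are illustrative assumptions, not the authors' implementation.

```python
# Illustrative sketch only: stronger regularization when fine-tuning a
# pre-trained encoder-decoder on low-resource E2E ST. All names and
# hyperparameter values are assumptions, not taken from the paper.
import torch
import torch.nn as nn

d_model, vocab_size, pad_id = 256, 1000, 0

# Stand-in for an encoder-decoder reused from high-resource ASR/MT pre-training.
pretrained = nn.Transformer(d_model=d_model, nhead=4,
                            num_encoder_layers=6, num_decoder_layers=6,
                            dropout=0.1, batch_first=True)

# Capacity gap: the large reused model over-fits the small ST set, so raise
# dropout before fine-tuning (illustrative value).
for module in pretrained.modules():
    if isinstance(module, nn.Dropout):
        module.p = 0.3

embed = nn.Embedding(vocab_size, d_model, padding_idx=pad_id)  # target-token embeddings
proj = nn.Linear(d_model, vocab_size)                          # output projection

# Label smoothing and weight decay are further standard regularizers here.
criterion = nn.CrossEntropyLoss(ignore_index=pad_id, label_smoothing=0.1)
optimizer = torch.optim.AdamW(
    list(pretrained.parameters()) + list(embed.parameters()) + list(proj.parameters()),
    lr=1e-4, weight_decay=0.01)

# One toy fine-tuning step on random "speech features" and target tokens.
speech = torch.randn(8, 120, d_model)              # batch of speech-feature frames
target = torch.randint(1, vocab_size, (8, 20))     # target-language token ids

decoder_out = pretrained(speech, embed(target[:, :-1]))  # teacher forcing
loss = criterion(proj(decoder_out).reshape(-1, vocab_size),
                 target[:, 1:].reshape(-1))
loss.backward()
optimizer.step()
print(f"fine-tuning step loss: {loss.item():.3f}")
```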