@inproceedings{ma-etal-2024-ms2sl,
title = "{MS}2{SL}: Multimodal Spoken Data-Driven Continuous Sign Language Production",
author = "Ma, Jian and
Wang, Wenguan and
Yang, Yi and
Zheng, Feng",
editor = "Ku, Lun-Wei and
Martins, Andre and
Srikumar, Vivek",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2024",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2024.findings-acl.432/",
doi = "10.18653/v1/2024.findings-acl.432",
pages = "7241--7254",
    abstract = "Sign language understanding has made significant strides; however, there is still no viable solution for generating sign sequences directly from entire spoken content, e.g., text or speech. In this paper, we propose a unified framework for continuous sign language production, easing communication between sign and non-sign language users. In particular, a sequence diffusion model, utilizing embeddings extracted from text or speech, is crafted to generate sign predictions step by step. Moreover, by creating a joint embedding space for text, audio, and sign, we bind these modalities and leverage the semantic consistency among them to provide informative feedback for the model training. This embedding-consistency learning strategy minimizes the reliance on sign triplets and ensures continuous model refinement, even with a missing audio modality. Experiments on How2Sign and PHOENIX14T datasets demonstrate that our model achieves competitive performance in sign language production."
}
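
The abstract describes binding text, audio, and sign in a joint embedding space and training with a consistency objective that still works when the audio modality is missing. Below is a minimal, hypothetical PyTorch sketch of that general idea; all module names, feature dimensions, and the cosine-based loss are assumptions for illustration, not the authors' released code.

```python
# Hypothetical sketch: project text, audio, and sign features into a shared
# space and penalize cross-modal disagreement (assumed dims and loss form).
import torch
import torch.nn as nn
import torch.nn.functional as F

class JointEmbedder(nn.Module):
    """Project each modality into a shared embedding space (assumed sizes)."""
    def __init__(self, text_dim=768, audio_dim=512, sign_dim=256, joint_dim=256):
        super().__init__()
        self.text_proj = nn.Linear(text_dim, joint_dim)
        self.audio_proj = nn.Linear(audio_dim, joint_dim)
        self.sign_proj = nn.Linear(sign_dim, joint_dim)

    def forward(self, text_feat, sign_feat, audio_feat=None):
        z_text = F.normalize(self.text_proj(text_feat), dim=-1)
        z_sign = F.normalize(self.sign_proj(sign_feat), dim=-1)
        z_audio = None
        if audio_feat is not None:  # audio may be absent (e.g., PHOENIX14T)
            z_audio = F.normalize(self.audio_proj(audio_feat), dim=-1)
        return z_text, z_sign, z_audio

def consistency_loss(z_text, z_sign, z_audio=None):
    """Pull embeddings of the same sample together across available modalities."""
    loss = 1.0 - F.cosine_similarity(z_text, z_sign, dim=-1).mean()
    if z_audio is not None:
        loss = loss + 1.0 - F.cosine_similarity(z_audio, z_sign, dim=-1).mean()
        loss = loss + 1.0 - F.cosine_similarity(z_text, z_audio, dim=-1).mean()
    return loss

# Usage example with random features standing in for real encoder outputs.
model = JointEmbedder()
z_t, z_s, z_a = model(torch.randn(4, 768), torch.randn(4, 256), torch.randn(4, 512))
print(consistency_loss(z_t, z_s, z_a))
```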