@inproceedings{mo-etal-2025-improving,
title = "Improving Sign Language Understanding with a Multi-Stream Masked Autoencoder Trained on {ASL} Videos",
author = "Mo, Junwen and
Vo, MinhDuc and
Nakayama, Hideki",
editor = "Inui, Kentaro and
Sakti, Sakriani and
Wang, Haofen and
Wong, Derek F. and
Bhattacharyya, Pushpak and
Banerjee, Biplab and
Ekbal, Asif and
Chakraborty, Tanmoy and
Singh, Dhirendra Pratap",
booktitle = "Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics",
month = dec,
year = "2025",
address = "Mumbai, India",
publisher = "The Asian Federation of Natural Language Processing and The Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.ijcnlp-long.66/",
pages = "1202--1218",
ISBN = "979-8-89176-298-5",
abstract = "Sign language understanding remains a significant challenge, particularly for low-resource sign languages with limited annotated data. Motivated by the success of large-scale pretraining in deep learning, we propose Multi-Stream Masked Autoencoder (MS-MAE) {---} a simple yet effective framework for learning sign language representations from skeleton-based video data. We pretrained a model with MS-MAE on the YouTube-ASL dataset, and then adapted it to multiple downstream tasks across different sign languages. Experimental results show that MS-MAE achieves competitive or superior performance on a range of isolated sign language recognition benchmarks and gloss-free sign language translation tasks across several sign languages. These findings highlight the potential of leveraging large-scale, high-resource sign language data to boost performance in low-resource sign language scenarios. Additionally, visualization of the model{'}s attention maps reveals its ability to cluster adjacent pose sequences within a sentence, some of which align with individual signs, offering insights into the mechanisms underlying successful transfer learning."
}