@inproceedings{ortiz-pado-2025-low,
    title     = "Low-Resource Sign Language Glossing Profits From Data Augmentation",
    author    = "Ortiz, Diana Vania Lara and
                 Pad{\'o}, Sebastian",
    editor    = "Hasanuzzaman, Mohammed and
                 Quiroga, Facundo Manuel and
                 Modi, Ashutosh and
                 Kamila, Sabyasachi and
                 Artiaga, Keren and
                 Joshi, Abhinav and
                 Singh, Sanjeet",
    booktitle = "Proceedings of the Workshop on Sign Language Processing (WSLP)",
    month     = dec,
    year      = "2025",
    address   = "IIT Bombay, Mumbai, India (Co-located with IJCNLP{--}AACL 2025)",
    publisher = "Association for Computational Linguistics",
    url       = "https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.wslp-main.3/",
    pages     = "14--19",
    isbn      = "979-8-89176-304-3",
    abstract  = "\textit{Glossing} is the task of translating from a written language into a sequence of \textit{glosses}, i.e., textual representations of signs from some sign language. While glossing is in principle `just' a machine translation (MT) task, sign languages still lack the large parallel corpora that exist for many written language pairs and underlie the development of dedicated MT systems. In this work, we demonstrate that glossing can be significantly improved through data augmentation. We fine-tune a Spanish transformer model both on a small dedicated corpus of 3,000 Spanish{--}Mexican Sign Language (MSL) gloss sentence pairs, and on a corpus augmented with an English{--}American Sign Language (ASL) gloss corpus. We obtain the best results when we oversample from the ASL corpus by a factor of {\textasciitilde}4, achieving a BLEU increase from 62 to 85 and a TER reduction from 44 to 20. This demonstrates the usefulness of combining corpora in low-resource glossing situations."
}

@comment{Informal Markdown citation, copied from the ACL Anthology page:
[Low-Resource Sign Language Glossing Profits From Data Augmentation](https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.wslp-main.3/) (Ortiz & Pado, WSLP 2025)
ACL}