% Conference paper from the ACL 2026 proceedings (Volume 1: Long Papers).
% NOTE(review): the url field points at a preview.aclanthology.org ingest build -- confirm the canonical link before relying on it.
@inproceedings{ye-etal-2026-hybrid,
  author    = {Ye, Maoxiao and
               Ye, Xinfeng and
               Manoharan, Sathiamoorthy},
  title     = {Hybrid Autoregressive-Diffusion Model for Real-Time Sign Language Production},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  pages     = {750--763},
  isbn      = {979-8-89176-390-6},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.31/},
  abstract  = {Earlier Sign Language Production (SLP) models typically relied on autoregressive methods that generate output tokens one by one, which inherently provide temporal alignment. Although techniques like Teacher Forcing can prevent model collapse during training, they still cannot solve the problem of error accumulation during inference, since ground truth is unavailable at that stage. In contrast, more recent approaches based on diffusion models leverage step-by-step denoising to enable high-quality generation. However, the iterative nature of these models and the requirement to denoise entire sequences limit their applicability in real-time tasks like SLP. To address it, we propose a hybrid autoregressive-diffusion model for Sign Language Production (SLP), combining sequential dependency modeling with iterative refinement. A Multi-Scale Pose Representation module captures fine-grained articulator features, while a Confidence-Aware Causal Attention mechanism guides generation using joint-level confidence scores. Experiments on PHOENIX14T and How2Sign show improved generation quality and real-time efficiency.},
}
Markdown (Informal)
[Hybrid Autoregressive-Diffusion Model for Real-Time Sign Language Production](https://preview.aclanthology.org/ingest-acl/2026.acl-long.31/) (Ye et al., ACL 2026)
ACL