% Workshop paper entry (GEM workshop proceedings, ACL Anthology export).
% NOTE(review): the url field points at a preview/ingest host, which looks like
% a staging link -- confirm the canonical Anthology URL before relying on it.
% The abstract keeps the Markdown-style *emphasis* markers from the export.
@inproceedings{gupta-etal-2026-speculative,
    title     = {Speculative Refinement: A Hybrid Autoregressive Diffusion Decoding Strategy and Its Behavior Across Benchmarks},
    author    = {Gupta, Aditi and
                 Mishra, Neel and
                 Trivedi, Kushagra and
                 Kumar, Pawan},
    editor    = {Mille, Simon and
                 Gehrmann, Sebastian and
                 Schmidtov{\'a}, Patr{\'i}cia and
                 Du{\v{s}}ek, Ond{\v{r}}ej and
                 Fadaee, Marzieh and
                 Lo, Kyle and
                 Santus, Enrico and
                 Stanovsky, Gabriel},
    booktitle = {Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})},
    month     = jul,
    year      = {2026},
    address   = {San Diego, California, USA},
    publisher = {Association for Computational Linguistics},
    url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.gem-main.33/},
    pages     = {355--363},
    isbn      = {979-8-89176-423-1},
    abstract  = {How should we evaluate generation systems that combine autoregressive (AR) and diffusion decoding? We study this question through *Speculative Refinement* (SpecRef), a training-free hybrid method that warm-starts a masked diffusion language model from an AR draft using entropy-guided selective masking. Evaluating SpecRef across six benchmarks (HumanEval, MBPP, GSM8K, BBH, ARC-Challenge, HellaSwag) with three distinct evaluation protocols (execution-based pass@1, exact-match, log-likelihood scoring), we surface several findings relevant beyond our specific system: (1) code benchmarks conflate structural discovery with logical correctness: providing a syntactic scaffold lifts accuracy from near zero to over 20{\%} without changing the model, indicating that much of the baseline failure is structural; (2) a *refinement tension* phenomenon where multi-stage correction degrades already-correct tokens, exposing benchmark saturation ceilings invisible to single-model evaluation; (3) log-likelihood and generative evaluation produce different model rankings for the same model pair, suggesting they measure different capabilities; (4) standard Python post-processing silently breaks code evaluation for non-AR generators. These observations apply to any multi-stage or non-autoregressive generation pipeline and point toward more diagnostic evaluation practices.},
}
Markdown (Informal)
[Speculative Refinement: A Hybrid Autoregressive Diffusion Decoding Strategy and Its Behavior Across Benchmarks](https://preview.aclanthology.org/ingest-acl-workshops/2026.gem-main.33/) (Gupta et al., GEM 2026)
ACL