@inproceedings{feng-etal-2025-masked,
title = "Masked Diffusion Captioning for Visual Feature Learning",
author = "Feng, Chao and
Wei, Zihao and
Owens, Andrew",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.findings-emnlp.1376/",
doi = "10.18653/v1/2025.findings-emnlp.1376",
pages = "25247--25263",
ISBN = "979-8-89176-335-7",
abstract = "We learn visual features by captioning images with an image-conditioned masked diffusion language model, a formulation we call masked diffusion captioning (MDC). During training, text tokens in each image{--}caption pair are masked at a randomly chosen ratio, and a decoder conditioned on visual features is trained to reconstruct the original text. After training, the learned visual features can be applied to downstream vision tasks. Unlike autoregressive captioning, the strength of the visual learning signal in MDC does not depend on each token{'}s position in the sequence, reducing the need for auxiliary objectives. Linear probing experiments across a variety of academic-scale models and datasets show that the learned visual features are competitive with those produced by autoregressive and contrastive approaches."
}