% Findings of ACL 2026 paper introducing DANCE (test-time adaptation of
% vision-language models), as exported from the ACL Anthology.
% NOTE(review): the url field points at a preview.aclanthology.org "ingest-acl"
% address -- presumably a staging location; confirm the canonical Anthology URL
% (and add a doi if one has been assigned) before relying on this entry.
@inproceedings{zou-etal-2026-dance,
  title     = {{DANCE}: Diversity-attended Dynamic Caching with Asymmetric Quantization for Test-time Adaptation of Vision-Language Models},
  author    = {Zou, Shunge and
               Wang, Changhu and
               Ju, Wei and
               Qiao, Ziyue and
               Luo, Xiao},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1860/},
  pages     = {37329--37343},
  isbn      = {979-8-89176-395-1},
  abstract  = {This paper studies the problem of test-time adaptation for vision-language models (VLMs). Recent approaches typically measure the prediction entropy to store a confident cache for logit refinement. However, these confident samples tend to approach prototypes with limited coverage of data distribution, which could result in biased predictions as the distribution evolves. Towards this end, we propose a novel approach named Diversity-attended Dynamic Caching with Asymmetric Quantization (DANCE) for test-time adaptation of VLMs. The core of our DANCE is to maintain a dynamic cache to store diversity-aware test samples, which support efficient logit adjustment via asymmetric quantization. In particular, we first generate multiple augmented views of each sample and aggregate their outputs from pre-trained VLMs via a consistency-aware mechanism. More importantly, we construct a dynamic cache, which stores the most reliable and diverse samples to cover evolving test distributions. To measure the diversity efficiently, we quantize cached samples and compute the asymmetric similarity across query samples and memory samples, which guide the cache updating via replacing samples with the lowest scores iteratively. Finally, we leverage the asymmetric similarity between the quantized prototype representations from the dynamic cache to update logits under distribution shifts. Extensive experiments on various benchmark datasets validate the superiority of the proposed DANCE in different settings.},
}
Markdown (Informal)
[DANCE: Diversity-attended Dynamic Caching with Asymmetric Quantization for Test-time Adaptation of Vision-Language Models](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1860/) (Zou et al., Findings 2026)
ACL