% Conference paper (ACL 2026) introducing Adaptive Weighted Proxy Tuning (AWPT).
% NOTE(review): the url field is a preview.aclanthology.org ingest link; confirm the final Anthology URL before release.
@inproceedings{azim-etal-2026-adaptive,
  title     = {Adaptive Weighted Proxy Tuning: Efficient Gray-Box Steering for Image Captioning},
  author    = {Azim, Nafew and Rahman, Fuad and Mohammed, Nabeel},
  editor    = {Li, Yunyao and Rehm, Georg and Tu, Mei},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} ({ACL} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-industry.85/},
  pages     = {1197--1217},
  isbn      = {979-8-89176-394-4},
  abstract  = {Adapting Large Vision-Language Models (LVLMs) to specialized domains typically demands resource-intensive fine-tuning or access to proprietary parameters ({``}white-box'' access). While decoding-time strategies like Proxy Tuning offer a parameter-efficient alternative, they rely on rigid, static logit arithmetic that fails to account for instance-specific variations in model certainty and domain shift. In this work, we introduce Adaptive Weighted Proxy Tuning (AWPT), a gray-box steering framework that dynamically modulates the logit contributions of a large base model, a fine-tuned expert, and an untuned anti-expert. Unlike static approaches, AWPT introduces two instance-aware mechanisms: (1) a lightweight ViT-based Weight Predictor that performs amortized inference to estimate optimal mixing coefficients in real-time with negligible added latency ($\sim$0.03s overhead), and (2) a Per-Sample Optimization objective that establishes theoretical performance bounds via gradient-based logit steering. Extensive evaluation across medical (ROCOv2, IU-Xray) and general domains (Flickr30k, MS COCO, TextCaps) demonstrates that AWPT achieves performance parity with fully fine-tuned models while remaining parameter-free regarding the generator. Crucially, our dynamic weighting acts as an effective regularizer, significantly reducing object hallucinations and establishing AWPT as a robust solution for deploying general-purpose LVLMs in safety-critical contexts.},
}
Markdown (Informal)
[Adaptive Weighted Proxy Tuning: Efficient Gray-Box Steering for Image Captioning.](https://preview.aclanthology.org/ingest-acl/2026.acl-industry.85/) (Azim et al., ACL 2026)
ACL