@inproceedings{qian-kang-2026-penny,
title = "``Penny Wise, Pixel Foolish'': Bypassing Price Constraints in Multimodal Agents via Visual Adversarial Perturbations",
author = "Qian, Jiachen and
Kang, Zhaolu",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-acl/2026.findings-acl.788/",
pages = "16059--16073",
ISBN = "979-8-89176-395-1",
abstract = "The rapid proliferation of Multimodal Large Language Models (MLLMs) has ushered in the era of the ``Agentic Economy,'' where Mobile Agents autonomously execute high-stakes financial transactions. While these agents demonstrate impressive operational capabilities, their adversarial robustness remains a glaring blind spot. In this paper, we identify a systemic vulnerability termed Visual Dominance Hallucination (VDH), where imperceptible adversarial visual cues can act as a ``super-stimulus,'' overriding textual price evidence in our evaluated screenshot-based price-constrained settings and forcing the agent into irrational economic decisions. We propose PriceBlind, a stealthy, white-box adversarial attack framework for controlled screenshot-based evaluation. Unlike prior works that rely on conspicuous artifacts like pop-ups, PriceBlind exploits the modality gap in CLIP-based encoders via a novel Semantic-Decoupling Loss. Rather than literally making a luxury item ``look cheap,'' this regularizer weakens the consistency between high-price text and visual value cues by aligning the image embedding with a low-cost/value-associated anchor region while preserving pixel-level fidelity. On our main E-ShopBench benchmark with clear price constraints, screenshot-based white-box evaluation yields ASRs around 80{\%} on the evaluated agents. Under the evaluated single-turn coordinate-selection protocol in a simplified layout-aware setting, our Ensemble-DI-FGSM strategy also yields non-trivial black-box transfer, with ASR roughly 35{--}41{\%} across GPT-4o, Gemini-1.5-Pro, and Claude-3.5-Sonnet. In the same screenshot-based setting, standard robust encoders reduce ASR only partially, while a Verify-then-Act stack with robust encoders lowers ASR to below 10{\%} at some clean-accuracy cost."
}Markdown (Informal)
["Penny Wise, Pixel Foolish": Bypassing Price Constraints in Multimodal Agents via Visual Adversarial Perturbations](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.788/) (Qian & Kang, Findings 2026)
ACL