@inproceedings{ahn-etal-2025-llms,
    title = "Can {LLM}s Deceive {CLIP}? Benchmarking Adversarial Compositionality of Pre-trained Multimodal Representation via Text Updates",
    author = "Ahn, Jaewoo and
      Yun, Heeseung and
      Ko, Dayoon and
      Kim, Gunhee",
    editor = "Che, Wanxiang and
      Nabende, Joyce and
      Shutova, Ekaterina and
      Pilehvar, Mohammad Taher",
    booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
    month = jul,
    year = "2025",
    address = "Vienna, Austria",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.acl-long.1280/",
    pages = "26382--26402",
    isbn = "979-8-89176-251-0",
    abstract = "While pre-trained multimodal representations (e.g., CLIP) have shown impressive capabilities, they exhibit significant compositional vulnerabilities leading to counterintuitive judgments. We introduce Multimodal Adversarial Compositionality (MAC), a benchmark that leverages large language models (LLMs) to generate deceptive text samples to exploit these vulnerabilities across different modalities and evaluates them through both sample-wise attack success rate and group-wise entropy-based diversity. To improve zero-shot methods, we propose a self-training approach that leverages rejection-sampling fine-tuning with diversity-promoting filtering, which enhances both attack success rate and sample diversity. Using smaller language models like Llama-3.1-8B, our approach demonstrates superior performance in revealing compositional vulnerabilities across various multimodal representations, including images, videos, and audios."
}
Markdown (Informal)
[Can LLMs Deceive CLIP? Benchmarking Adversarial Compositionality of Pre-trained Multimodal Representation via Text Updates](https://aclanthology.org/2025.acl-long.1280/) (Ahn et al., ACL 2025)
ACL