% InstructAny2Pix, Findings of NAACL 2025 (ACL Anthology ID 2025.findings-naacl.36).
% The url field uses the canonical Anthology address rather than a temporary preview branch.
@inproceedings{li-etal-2025-instructany2pix,
  title     = {{InstructAny2Pix}: Image Editing with Multi-Modal Prompts},
  author    = {Li, Shufan and
               Singh, Harkanwar and
               Grover, Aditya},
  editor    = {Chiruzzo, Luis and
               Ritter, Alan and
               Wang, Lu},
  booktitle = {Findings of the Association for Computational Linguistics: {NAACL} 2025},
  month     = apr,
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-naacl.36/},
  doi       = {10.18653/v1/2025.findings-naacl.36},
  pages     = {594--619},
  isbn      = {979-8-89176-195-7},
  abstract  = {Image Editing has made incredible progress in recent years. Earliest work only supported caption-guided editing. Recently, free-form text instructions and reference images are incorporated to allow more flexibility. However, existing methods still struggle with complicated editing instructions involving multiple objects or reference images. We present InstructAny2Pix, a novel image editing model that leverages a multi-modal LLM to execute complicated edit instructions. Compared with previous works, InstructAny2Pix extends the flexibility of edit instructions in three ways: First, it can perform complex instructions involving multiple object edits; Second, it supports interleaving text instructions with multiple reference images; Third, it supports audio and music inputs as part of edit prompts, unlocking many creative applications, such as album cover generation and music-inspired merchandise design. To evaluate the effectiveness of InstructAny2Pix, we propose two new benchmark datasets MM-Inst and Dream-booth++ consisting of human written, multi-modal prompts. InstructAny2Pix outperforms baselines in these two proposed multi-modal benchmarks, as well as conventional image editing benchmarks such as InstructPix2Pix.},
}
Markdown (Informal)
[InstructAny2Pix: Image Editing with Multi-Modal Prompts](https://aclanthology.org/2025.findings-naacl.36/) (Li et al., Findings 2025)
ACL