% Conference paper entry: Kim and Yoon, JoMPO, Findings of EACL 2026.
% NOTE(review): the url field below points at a preview/ingest host; confirm
% the permanent link before relying on it in a published bibliography.
@inproceedings{kim-yoon-2026-joint,
  author    = {Kim, Jiwon and Yoon, Hyunsoo},
  title     = {Joint Multimodal Preference Optimization for Fine-Grained Visual-Textual Alignment},
  editor    = {Demberg, Vera and Inui, Kentaro and Marquez, Llu{\'\i}s},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {EACL} 2026},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  pages     = {79--94},
  isbn      = {979-8-89176-386-9},
  url       = {https://preview.aclanthology.org/ingest-eacl/2026.findings-eacl.5/},
  abstract  = {Recent research has focused on addressing multimodal hallucinations in Large Vision-Language Models (LVLMs) by extending Direct Preference Optimization (DPO) to incorporate visual preference supervision. However, these methods often lack fine-grained visual contrast mechanisms and rely on single-margin optimization. This in turn limits their ability to capture precise visual semantics and results in weak multimodal alignment. To address these issues, we propose Joint Multimodal Preference Optimization (JoMPO), a novel optimization framework that symmetrically integrates a text-conditioned preference loss with a visual ranking-based objective. JoMPO leverages semantically contrastive image{--}text pairs and listwise ranking over multiple visual contexts, enabling fine-grained visual grounding and more robust cross-modal alignment. To support this framework, we introduce the Visual{--}Textual Contrast (VTC) dataset, consisting of image pairs that are semantically similar but visually distinct, each paired with a contextually grounded textual response. When trained with only 5k contrastive pairs, JoMPO consistently demonstrates superior performance across diverse benchmarks, highlighting its effectiveness in mitigating hallucinations and improving image-text alignment in LVLMs.},
}
Markdown (Informal)
[Joint Multimodal Preference Optimization for Fine-Grained Visual-Textual Alignment](https://preview.aclanthology.org/ingest-eacl/2026.findings-eacl.5/) (Kim & Yoon, Findings 2026)
ACL