% ACL Anthology record for the REVEALER paper (ACL 2026, Volume 1: Long Papers).
% NOTE(review): the last author is given as the single token "Binchen"; confirm
% whether a family/given-name split is missing before relying on sorted output.
% NOTE(review): the url points at a preview/ingest host; confirm the final
% Anthology URL once the volume is published.
@inproceedings{shi-etal-2026-revealer,
  title     = {{REVEALER}: Reinforcement-Guided Visual Reasoning for Element-Level Text-Image Alignment Evaluation},
  author    = {Shi, FuLin and
               Xiao, Wenyi and
               Gan, Leilei and
               Ding, Liang and
               Binchen},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.2200/},
  pages     = {47630--47649},
  isbn      = {979-8-89176-390-6},
  abstract  = {Evaluating the alignment between textual prompts and generated images is critical for ensuring the reliability and usability of text-to-image (T2I) models. However, most existing evaluation methods rely on coarse-grained metrics or static Question Answering (QA) pipelines, which lack fine-grained interpretability and struggle to reflect human preferences. To address this, we propose REVEALER, a reinforcement-guided visual reasoning framework for element-level text-to-image alignment evaluation. Adopting a structured ``grounding--reasoning--conclusion'' paradigm, our method enables Multimodal Large Language Models (MLLMs) to explicitly localize semantic elements and derive interpretable alignment judgments. We optimize the model via Group Relative Policy Optimization (GRPO) using a multi-dimensional reward function that targets format compliance, localization precision, and alignment accuracy. Extensive experiments confirm that REVEALER achieves state-of-the-art results across four benchmarks. Notably, on EvalMuse-40K, it surpasses the strong proprietary Gemini 3 Pro and Training-based baselines with absolute accuracy gains of +4.2\% and +13.3\%, respectively. Ablation studies further demonstrate the efficacy of our method, contributing a cumulative 19.6\% improvement over the base model.},
}

Markdown (Informal)
[REVEALER: Reinforcement-Guided Visual Reasoning for Element-Level Text-Image Alignment Evaluation](https://preview.aclanthology.org/ingest-acl/2026.acl-long.2200/) (Shi et al., ACL 2026)
ACL