% Findings of ACL 2026 paper on query-side region cropping for multi-modal re-ranking.
% The system name is braced in the title so sentence-casing styles keep its capitals;
% the abstract is plain text (no Markdown emphasis) so it renders cleanly under LaTeX.
@inproceedings{hu-tu-2026-region,
    title = "{Region-R1}: Reinforcing Query-Side Region Cropping for Multi-Modal Re-Ranking",
    author = "Hu, Chan-Wei and
      Tu, Zhengzhong",
    editor = "Liakata, Maria and
      Moreira, Viviane P. and
      Zhang, Jiajun and
      Jurgens, David",
    booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
    month = jul,
    year = "2026",
    address = "San Diego, California, United States",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/ingest-acl/2026.findings-acl.510/",
    pages = "10492--10505",
    isbn = "979-8-89176-395-1",
    abstract = "Multi-modal retrieval-augmented generation (MM-RAG) relies heavily on re-rankers to surface the most relevant evidence for image-question queries. However, standard re-rankers typically process the full query image as a global embedding, making them susceptible to visual distractors (e.g., background clutter) that skew similarity scores. We propose Region-R1, a query-side region cropping framework that formulates region selection as a decision-making problem during re-ranking, allowing the system to learn to retain the full image or focus only on a question-relevant region before scoring the retrieved candidates. Region-R1 learns a policy with a novel region-aware group relative policy optimization (r-GRPO) to dynamically crop a discriminative region. Across two challenging benchmarks, E-VQA and InfoSeek, Region-R1 delivers consistent gains, achieving state-of-the-art performances by increasing conditional Recall@1 by up to 20{\%}. These results show the great promise of query-side adaptation as a simple but effective way to strengthen MM-RAG re-ranking."
}
Markdown (Informal)
[Region-R1: Reinforcing Query-Side Region Cropping for Multi-Modal Re-Ranking](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.510/) (Hu & Tu, Findings 2026)
ACL