% ReFLAIR paper (Ji and Lu), Findings of EMNLP 2025.
% - url is the canonical ACL Anthology address built from the Anthology ID in
%   the DOI, not a temporary preview-branch deployment.
% - The model name is brace-protected as a whole word so sentence-casing
%   styles keep its capitalisation.
% - NOTE(review): the abstract's "think re-think answer" span is wrapped in
%   escaped dollar signs and looks like tag markup lost during export;
%   confirm the intended format string against the published paper.
@inproceedings{ji-lu-2025-reflair,
  title     = {{ReFLAIR}: Enhancing Multimodal Reasoning via Structured Reflection and Reward-Guided Learning},
  author    = {Ji, Jiazhou and
               Lu, Xinru},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2025},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-emnlp.1384/},
  doi       = {10.18653/v1/2025.findings-emnlp.1384},
  pages     = {25401--25413},
  isbn      = {979-8-89176-335-7},
  abstract  = {Large models can achieve higher performance on complex problems through iterative self-reflection. Yet when reflection is uncontrolled, it often leads to longer outputs, higher inference cost, and an increased risk of hallucination. Existing training methods rarely address this trade off. We introduce ReFLAIR, a unified framework that teaches multimodal large models to perform structured reflection via an explicit {\$}think re-think answer {\$} format and hybrid reward learning. ReFLAIR begins with supervised cold start training on the ReFLAIR-cold dataset of curated multimodal reasoning trajectories, and then trains a Reflection Quality Scorer (RQS) to quantify the utility of rethinking steps. A modified Group Relative Policy Optimization algorithm optimizes a hybrid reward that combines answer correctness, structural fidelity, reflection utility, and sample difficulty. Evaluated on challenging mathematical benchmarks including MathVista, MathVerse, MM-Math and GSM8K, ReFLAIR yields improvements up to +12.2{\%} absolute accuracy, produces higher quality reflective traces, and reduces harmful or redundant revisions. An adaptive test time reflection scheduler further reduces inference cost by nearly 23{\%} while maintaining or improving accuracy. These results demonstrate that structured, reward guided reflection offers a scalable pathway to more reliable and interpretable reasoning in multimodal models.},
}
Markdown (Informal)
[ReFLAIR: Enhancing Multimodal Reasoning via Structured Reflection and Reward-Guided Learning](https://aclanthology.org/2025.findings-emnlp.1384/) (Ji & Lu, Findings 2025)
ACL