@inproceedings{ranaldi-etal-2025-r2,
title = "{R}2-{M}ulti{O}mnia: Leading Multilingual Multimodal Reasoning via Self-Training",
author = "Ranaldi, Leonardo and
Ranaldi, Federico and
Pucci, Giulia",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.402/",
pages = "8220--8234",
ISBN = "979-8-89176-251-0",
abstract = "Reasoning is an intricate process that transcends both language and vision; yet, despite its inherently modality-agnostic nature, developing effective multilingual and multimodal reasoning capabilities remains a substantial challenge for Multimodal Large Language Models (MLLMs). They struggle to activate complex reasoning behaviours, such as delivering step-wise explanations, questioning, and reflection, particularly in multilingual settings where high-quality supervision across languages is lacking. Recent works have introduced eclectic strategies to enhance MLLMs' reasoning; however, they remain limited to a single language. To align MLLMs' reasoning capabilities across languages and improve performance across modalities, we propose R2-MultiOmnia, a modular approach that instructs the models to abstract key elements of the reasoning process and then refine reasoning trajectories via self-correction. Specifically, we instruct the models to produce multimodal synthetic resources by bridging modalities and then to self-improve their capabilities. To stabilise learning and the structure of the reasoning processes, we propose Curriculum Learning Reasoning Stabilisation with structured output rewards to gradually refine the models' capabilities to learn and deliver robust reasoning processes. Experiments show that R2-MultiOmnia improves multimodal reasoning and achieves aligned performance across languages, approaching strong models."
}