@inproceedings{wei-etal-2026-multimodal,
    title = "Do Multimodal {LLM}s Understand Order? Measuring the Fragility of Multimodal Reasoning under Input Order Perturbations",
    author = "Wei, Sheng-Lun and
      Liao, Yu-Ling and
      Huang, Hen-Hsen and
      Chen, Hsin-Hsi",
    editor = "Piperidis, Stelios and
      Bel, N{\'u}ria and
      van den Heuvel, Henk and
      Ide, Nancy and
      Krek, Simon and
      Toral, Antonio",
    booktitle = "International Conference on Language Resources and Evaluation",
    month = may,
    year = "2026",
    address = "Palma de Mallorca, Spain",
    publisher = "ELRA Language Resource Association",
    url = "https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.716/",
    internal-note = "NOTE(review): preview/staging URL -- switch to https://aclanthology.org/2026.lrec-main.716/ once the canonical Anthology page is live",
    pages = "9118--9128",
    abstract = "Multimodal reasoning has progressed rapidly with large vision-language models (LVLMs), yet their robustness under input variations remains underexplored. This study investigates positional bias in LVLMs for multimodal multiple-choice questions. Our analysis shows that model predictions are sensitive to both choice and modality ordering. We conduct a large-scale evaluation on MMMU, CVQA, and MMBench using fourteen representative models. Further analysis examines how question properties, including difficulty, domain, and image type, affect robustness. We also assess whether text-based mitigation strategies transfer to the VQA setting and perform ablation studies on self-consistency and reasoning complexity. Overall, our findings provide the first comprehensive understanding of positional bias from a vision-language perspective, highlighting key challenges in achieving stable multimodal reasoning."
}
Markdown (Informal)
[Do Multimodal LLMs Understand Order? Measuring the Fragility of Multimodal Reasoning under Input Order Perturbations](https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.716/) (Wei et al., LREC 2026)
ACL