@comment{
  ACL Anthology record for the M-MiniGPT4 paper in the AfricaNLP 2026
  workshop proceedings.
  NOTE(review): the url field points at a preview.aclanthology.org branch
  build; confirm the canonical Anthology URL before relying on this link.
}
@inproceedings{han-etal-2026-minigpt4,
  author    = {Han, Seung Hun Eddie and
               Mohamed, Youssef and
               Elhoseiny, Mohamed},
  title     = {{M-MiniGPT4}: Multilingual {VLLM} Alignment via Translated Data},
  editor    = {Chimoto, Everlyn Asiko and
               Lignos, Constantine and
               Muhammad, Shamsuddeen and
               Abdulmumin, Idris and
               Siro, Clemencia and
               Adelani, David Ifeoluwa},
  booktitle = {Proceedings of the 7th Workshop on {African} Natural Language Processing ({AfricaNLP} 2026)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  pages     = {11--16},
  isbn      = {979-8-89176-364-7},
  url       = {https://preview.aclanthology.org/manual-author-scripts/2026.africanlp-main.2/},
  abstract  = {This paper presents a Multilingual Vision Large Language Model, named M-MiniGPT4. Our model exhibits strong vision-language understanding (VLU) capabilities across 11 languages. We utilize a mixture of native multilingual and translated data to push the multilingual VLU performance of the MiniGPT4 architecture. In addition, we propose a multilingual alignment training stage that uses parallel text corpora to further enhance the multilingual capabilities of our model. M-MiniGPT4 achieves 36{\%} accuracy on the multilingual MMMU benchmark, outperforming state-of-the-art models in the same weight class, including foundation models released after the majority of this work was completed. We open-source our models, code, and translated datasets to facilitate future research in low-resource and multilingual settings.},
}
Markdown (Informal)
[M-MiniGPT4: Multilingual VLLM Alignment via Translated Data](https://preview.aclanthology.org/manual-author-scripts/2026.africanlp-main.2/) (Han et al., AfricaNLP 2026)
ACL