@inproceedings{ling-etal-2025-ecommmmu,
title = "{E}com{MMMU}: Strategic Utilization of Visuals for Robust Multimodal {E}-commerce Models",
author = "Ling, Xinyi and
Du, Hanwen and
Zhu, Zhihui and
Ning, Xia",
editor = "Inui, Kentaro and
Sakti, Sakriani and
Wang, Haofen and
Wong, Derek F. and
Bhattacharyya, Pushpak and
Banerjee, Biplab and
Ekbal, Asif and
Chakraborty, Tanmoy and
Singh, Dhirendra Pratap",
booktitle = "Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics",
month = dec,
year = "2025",
address = "Mumbai, India",
publisher = "The Asian Federation of Natural Language Processing and The Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.ijcnlp-long.43/",
pages = "769--790",
ISBN = "979-8-89176-298-5",
    abstract = "E-commerce platforms are rich in multimodal data, featuring a variety of images that depict product details. However, this raises an important question: do these images always enhance product understanding, or can they sometimes introduce redundancy or degrade performance? Existing datasets are limited in both scale and design, making it difficult to systematically examine this question. To this end, we introduce EcomMMMU, an e-commerce multimodal multitask understanding dataset with 406,190 samples and 8,989,510 images. EcomMMMU comprises multi-image visual-language data designed with 8 essential tasks and a specialized VSS subset to benchmark the capability of multimodal large language models (MLLMs) to effectively utilize visual content. Analysis of EcomMMMU reveals that product images do not consistently improve performance and can, in some cases, degrade it. This indicates that MLLMs may struggle to effectively leverage rich visual content for e-commerce tasks. Building on these insights, we propose SUMEI, a data-driven method that strategically utilizes multiple images by predicting visual utilities before using them for downstream tasks. Comprehensive experiments demonstrate the effectiveness and robustness of SUMEI. The data and code are available at https://github.com/ninglab/EcomMMMU."
}

Markdown (Informal)
[EcomMMMU: Strategic Utilization of Visuals for Robust Multimodal E-commerce Models](https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.ijcnlp-long.43/) (Ling et al., IJCNLP-AACL 2025)
ACL
Xinyi Ling, Hanwen Du, Zhihui Zhu, and Xia Ning. 2025. EcomMMMU: Strategic Utilization of Visuals for Robust Multimodal E-commerce Models. In Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics, pages 769–790, Mumbai, India. The Asian Federation of Natural Language Processing and The Association for Computational Linguistics.