@inproceedings{ling-etal-2025-captions,
title = "Captions Speak Louder than Images: Generalizing Foundation Models for {E}-commerce from High-quality Multimodal Instruction Data",
author = "Ling, Xinyi and
Du, Hanwen and
Peng, Bo and
Zhu, Zhihui and
Ning, Xia",
editor = "Inui, Kentaro and
Sakti, Sakriani and
Wang, Haofen and
Wong, Derek F. and
Bhattacharyya, Pushpak and
Banerjee, Biplab and
Ekbal, Asif and
Chakraborty, Tanmoy and
Singh, Dhirendra Pratap",
booktitle = "Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics",
month = dec,
year = "2025",
address = "Mumbai, India",
publisher = "The Asian Federation of Natural Language Processing and The Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.ijcnlp-long.42/",
pages = "743--768",
ISBN = "979-8-89176-298-5",
abstract = "Multimodal foundation models (MFMs) have demonstrated strong capabilities in e-commerce by effectively leveraging multimodal data to enhance product understanding and user experienceHowever, the development of e-commerce MFMs is hindered by two challenges: (1) the scarcity of large-scale, high-quality multimodal benchmark datasets; and (2) the lack of effective multimodal information integration methods in e-commerce. To address these challenges, we introduce MMECInstruct, the first large-scale, high-quality multimodal instruction dataset designed specifically for e-commerce MFMs. MMECInstruct comprises 75,000 samples covering 7 real-world e-commerce tasks, supporting both in-domain (IND) and out-of-domain (OOD) evaluations. Leveraging MMECInstruct, we develop CASLIE, a lightweight framework that enhances multimodal information understanding and integration for e-commerce. Our comprehensive evaluation demonstrates that MMECInstruct endows CASLIE with advanced capability and strong generalizability in e-commerce applications. MMECInstruct and CASLIE models are publicly accessible through https://github.com/ninglab/CASLIE."
}