@comment{
  Workshop paper from the 4th Workshop on e-Commerce and NLP (2021).
  The url field uses the canonical ACL Anthology address, not a preview-branch build.
  Braces in title and booktitle keep the proper noun and the acronym capitalised
  under bibliography styles that apply sentence casing.
}
@inproceedings{chen-etal-2021-multimodal-item,
    title = "Multimodal Item Categorization Fully Based on {Transformer}",
    author = "Chen, Lei and
      Chou, Houwei and
      Xia, Yandi and
      Miyake, Hirokazu",
    editor = "Malmasi, Shervin and
      Kallumadi, Surya and
      Ueffing, Nicola and
      Rokhlenko, Oleg and
      Agichtein, Eugene and
      Guy, Ido",
    booktitle = "Proceedings of the 4th Workshop on e-Commerce and {NLP}",
    month = aug,
    year = "2021",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.ecnlp-1.13/",
    doi = "10.18653/v1/2021.ecnlp-1.13",
    pages = "111--115",
    abstract = "The Transformer has proven to be a powerful feature extraction method and has gained widespread adoption in natural language processing (NLP). In this paper we propose a multimodal item categorization (MIC) system solely based on the Transformer for both text and image processing. On a multimodal product data set collected from a Japanese e-commerce giant, we tested a new image classification model based on the Transformer and investigated different ways of fusing bi-modal information. Our experimental results on real industry data showed that the Transformer-based image classifier has performance on par with ResNet-based classifiers and is four times faster to train. Furthermore, a cross-modal attention layer was found to be critical for the MIC system to achieve performance gains over text-only and image-only models."
}
Markdown (Informal)
[Multimodal Item Categorization Fully Based on Transformer](https://aclanthology.org/2021.ecnlp-1.13/) (Chen et al., ECNLP 2021)
ACL