% Hessel and Lee, EMNLP 2020 main-conference paper (pages 861--877).
% Proposes EMAP (empirical multimodally-additive function projection), a
% diagnostic for whether cross-modal interactions improve a model's performance.
% The url field uses the canonical ACL Anthology address, not a preview build.
@inproceedings{hessel-lee-2020-multimodal,
    title     = {Does my multimodal model learn cross-modal interactions? {It's} harder to tell than you might think!},
    author    = {Hessel, Jack and
                 Lee, Lillian},
    editor    = {Webber, Bonnie and
                 Cohn, Trevor and
                 He, Yulan and
                 Liu, Yang},
    booktitle = {Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing ({EMNLP})},
    month     = nov,
    year      = {2020},
    address   = {Online},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2020.emnlp-main.62/},
    doi       = {10.18653/v1/2020.emnlp-main.62},
    pages     = {861--877},
    abstract  = {Modeling expressive cross-modal interactions seems crucial in multimodal tasks, such as visual question answering. However, sometimes high-performing black-box algorithms turn out to be mostly exploiting unimodal signals in the data. We propose a new diagnostic tool, empirical multimodally-additive function projection (EMAP), for isolating whether or not cross-modal interactions improve performance for a given model on a given task. This function projection modifies model predictions so that cross-modal interactions are eliminated, isolating the additive, unimodal structure. For seven image+text classification tasks (on each of which we set new state-of-the-art benchmarks), we find that, in many cases, removing cross-modal interactions results in little to no performance degradation. Surprisingly, this holds even when expressive models, with capacity to consider interactions, otherwise outperform less expressive models; thus, performance improvements, even when present, often cannot be attributed to consideration of cross-modal feature interactions. We hence recommend that researchers in multimodal machine learning report the performance not only of unimodal baselines, but also the EMAP of their best-performing model.},
}
Markdown (Informal)
[Does my multimodal model learn cross-modal interactions? It’s harder to tell than you might think!](https://aclanthology.org/2020.emnlp-main.62/) (Hessel & Lee, EMNLP 2020)
ACL