@inproceedings{sahay-etal-2020-low,
title = "Low Rank Fusion based Transformers for Multimodal Sequences",
author = "Sahay, Saurav and
Okur, Eda and
H Kumar, Shachi and
Nachman, Lama",
editor = "Zadeh, Amir and
Morency, Louis-Philippe and
Liang, Paul Pu and
Poria, Soujanya",
booktitle = "Second Grand-Challenge and Workshop on Multimodal Language (Challenge-HML)",
month = jul,
year = "2020",
address = "Seattle, USA",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2020.challengehml-1.4/",
doi = "10.18653/v1/2020.challengehml-1.4",
pages = "29--34",
abstract = "Our senses individually work in a coordinated fashion to express our emotional intentions. In this work, we experiment with modeling modality-specific sensory signals to attend to our latent multimodal emotional intentions and vice versa expressed via low-rank multimodal fusion and multimodal transformers. The low-rank factorization of multimodal fusion amongst the modalities helps represent approximate multiplicative latent signal interactions. Motivated by the work of (CITATION) and (CITATION), we present our transformer-based cross-fusion architecture without any over-parameterization of the model. The low-rank fusion helps represent the latent signal interactions while the modality-specific attention helps focus on relevant parts of the signal. We present two methods for the Multimodal Sentiment and Emotion Recognition results on CMU-MOSEI, CMU-MOSI, and IEMOCAP datasets and show that our models have lesser parameters, train faster and perform comparably to many larger fusion-based architectures."
}
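
The low-rank factorization of multimodal fusion mentioned in the abstract replaces a full outer-product (multiplicative) tensor fusion of the modality vectors with per-modality rank factors. The following is a minimal PyTorch sketch of that general idea for illustration only; the class name LowRankFusion, the dimensions, the rank, and the initialization are assumptions, not the authors' released implementation (which further combines this fusion with transformer-based cross-modal attention).

import torch
import torch.nn as nn

class LowRankFusion(nn.Module):
    """Rank-factored multimodal fusion (a minimal sketch, not the authors' code).

    Each modality vector, with a constant 1 appended so unimodal terms survive,
    is projected by `rank` factor matrices; the element-wise product of the
    projections across modalities, summed over the rank dimension, approximates
    the full multiplicative (outer-product) tensor fusion at far lower cost.
    """

    def __init__(self, input_dims, output_dim, rank=4):
        super().__init__()
        self.rank = rank
        # One stack of rank factors per modality: shape (rank, d_m + 1, output_dim).
        self.factors = nn.ParameterList([
            nn.Parameter(torch.randn(rank, d + 1, output_dim) * 0.1)
            for d in input_dims
        ])
        self.rank_weights = nn.Parameter(torch.randn(rank) * 0.1)
        self.bias = nn.Parameter(torch.zeros(output_dim))

    def forward(self, modality_inputs):
        # modality_inputs: list of (batch, d_m) tensors, one per modality.
        batch = modality_inputs[0].size(0)
        fused = None
        for x, w in zip(modality_inputs, self.factors):
            ones = torch.ones(batch, 1, device=x.device, dtype=x.dtype)
            x1 = torch.cat([x, ones], dim=1)                 # (batch, d_m + 1)
            proj = torch.einsum('bd,rdo->rbo', x1, w)        # (rank, batch, out)
            fused = proj if fused is None else fused * proj  # multiplicative interaction
        # Weighted sum over the rank dimension gives the fused representation.
        return torch.einsum('r,rbo->bo', self.rank_weights, fused) + self.bias

A hypothetical usage with text/audio/visual feature sizes chosen arbitrarily:

fusion = LowRankFusion(input_dims=[300, 74, 35], output_dim=64, rank=4)
text, audio, visual = torch.randn(8, 300), torch.randn(8, 74), torch.randn(8, 35)
fused = fusion([text, audio, visual])  # shape: (8, 64)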