% Conference paper (LREC 2026, Anthology ID 2026.lrec-main.717).
% Typed as an inproceedings entry so that editor, publisher and address are
% rendered by standard styles; the proceedings name lives in booktitle.
% NOTE(review): url points at a preview/ingest host -- confirm the canonical
% Anthology URL before relying on this entry.
@inproceedings{wernlein-etal-2026-early,
  title         = {Early Fusion with Contrastive Learning: A Lightweight Alternative for Multi-modal Classification},
  author        = {Wernlein, Felix and
                   Jana, Abhik and
                   Sikdar, Sandipan},
  editor        = {Piperidis, Stelios and
                   Bel, N{\'u}ria and
                   van den Heuvel, Henk and
                   Ide, Nancy and
                   Krek, Simon and
                   Toral, Antonio},
  booktitle     = {International Conference on Language Resources and Evaluation},
  month         = may,
  year          = {2026},
  address       = {Palma de Mallorca, Spain},
  publisher     = {ELRA Language Resource Association},
  url           = {https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.717/},
  pages         = {9129--9138},
  internal-note = {Original export carried volume = main (an Anthology volume label, not a numeric volume)},
  abstract      = {With the emergence of numerous modalities, such as text, image, audio, etc., the use of effective multimodal systems has increased significantly. However, one of the significant challenges faced by such multimodal systems is effectively aligning and integrating diverse modalities. Several models have been proposed to address these issues; however, state-of-the-art performance is achieved by complex, heavyweight models (complexity measured in terms of trainable parameters) alone. Hence, we propose a simple yet effective lightweight framework explicitly designed for multimodal classification tasks, utilising the early fusion method combined with a contrastive learning approach. The early fusion method focuses on fusing different modalities at the input level, whereas contrastive learning allows a single modality to capture intra-modality relationships. Experiments on three different genres of multimodal classification datasets demonstrate that the proposed lightweight framework achieves performance comparable to the most competitive heavyweight state-of-the-art models and, in some cases, even outperforms them.},
}
Markdown (Informal)
[Early Fusion with Contrastive Learning: A Lightweight Alternative for Multi-modal Classification](https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.717/) (Wernlein et al., LREC 2026)
ACL