@inproceedings{bilen-etal-2026-visaffect,
    title     = {{VisAffect} at {MWE}-2026 {AdMIRe} 2: {IMMCAN} Idiom Multimodal Cross-Attention Network},
    author    = {Bilen, Bar{\i}{\c{s}} and
                 Azmoudeh, Ali and
                 Ekenel, Haz{\i}m Kemal and
                 Kose, Hatice},
    editor    = {Ojha, Atul Kr. and
                 Mititelu, Verginica Barbu and
                 Constant, Mathieu and
                 Stoyanova, Ivelina and
                 Do{\u{g}}ru{\"o}z, A. Seza and
                 Rademaker, Alexandre},
    booktitle = {Proceedings of the 22nd Workshop on Multiword Expressions ({MWE} 2026)},
    month     = mar,
    year      = {2026},
    address   = {Rabat, Morocco},
    publisher = {Association for Computational Linguistics},
    url       = {https://preview.aclanthology.org/ingest-eacl/2026.mwe-1.19/},
    pages     = {149--153},
    isbn      = {979-8-89176-363-0},
    abstract  = {We address AdMIRe 2.0, a static image ranking task where a sentence containing a potentially idiomatic expression is paired with five image{--}caption candidates, and the goal is to rank the candidates by semantic compatibility with the intended idiomatic or literal meaning. We propose IMMCAN, which keeps XLM-R and Jina-CLIP-v2 frozen and learns a lightweight two-stage cross-attention fusion, caption{--}image grounding followed by idiom-to-multimodal conditioning, to predict a compatibility score per candidate. We also evaluate caption-only augmentation via back-translation and synonym substitution, and compare regression and rank-class formulations. On AdMIRe 1.0, text-only achieves higher test top-image accuracy than VLM-grounded modeling. In contrast, on AdMIRe 2.0 zero-shot, adding visual patch grounding improves both accuracy and NDCG indicating better cross-lingual ranking transfer.},
}
@comment{Informal Markdown citation pasted from the ACL Anthology page; kept for
reference but inert to BibTeX:
[VisAffect at MWE-2026 AdMIRe 2: IMMCAN Idiom Multimodal Cross-Attention Network](https://preview.aclanthology.org/ingest-eacl/2026.mwe-1.19/) (Bilen et al., MWE 2026)
ACL}