% Conference paper: Gurjar and Krishnaswamy, EACL 2026 Student Research
% Workshop (Volume 4 of the 19th EACL proceedings), pages 483--492.
% NOTE(review): the url field points at a preview/ingest host; confirm the
% canonical address once the volume is published.
@inproceedings{gurjar-krishnaswamy-2026-scale,
  author    = {Gurjar, Animesh and Krishnaswamy, Nikhil},
  title     = {Scale Is All You Need: Analyzing Modality Interaction and Speaker Intent Without Fine-Tuning},
  editor    = {Baez Santamaria, Selene and Somayajula, Sai Ashish and Yamaguchi, Atsuki},
  booktitle = {Proceedings of the 19th Conference of the {E}uropean Chapter of the {A}ssociation for {C}omputational {L}inguistics (Volume 4: Student Research Workshop)},
  publisher = {Association for Computational Linguistics},
  address   = {Rabat, Morocco},
  month     = mar,
  year      = {2026},
  pages     = {483--492},
  isbn      = {979-8-89176-383-8},
  url       = {https://preview.aclanthology.org/ingest-eacl/2026.eacl-srw.36/},
  abstract  = {Understanding sarcasm requires integrating cues from language, voice, and facial expression. Recent work has achieved impressive results using large multimodal Transformers, but such models are computationally expensive and often obscure how each modality contributes to the final prediction. This paper introduces a lightweight, interpretable framework for multimodal sarcasm detection that combines frozen text, audio, and visual embeddings from pretrained encoders through compact fusion heads. Using the MUStARD++Balanced dataset, we show that early fusion of textual and acoustic features improves over the best unimodal baseline. Character-specific evaluation further shows that sarcasm expressed through overt prosodic and visual cues is substantially easier to detect than monotone, context-dependent sarcasm. Additionally, we evaluate generalization to different characters through leave-one-speaker-out (LOSO) experiments and run ablation-style transfer experiments on two speakers with similar sarcasm distributions. These findings demonstrate that effective multimodal sarcasm understanding can emerge from frozen, resource-efficient representations without large-scale fine-tuning, emphasizing the importance of modality interaction and delivery style rather than model scale.},
}
Markdown (Informal)
[Scale Is All You Need: Analyzing Modality Interaction and Speaker Intent Without Fine-Tuning](https://preview.aclanthology.org/ingest-eacl/2026.eacl-srw.36/) (Gurjar & Krishnaswamy, EACL 2026)
ACL