@inproceedings{sosea-caragea-2025-hard,
title = "Hard Emotion Test Evaluation Sets for Language Models",
author = "Sosea, Tiberiu and
Caragea, Cornelia",
editor = "Chiruzzo, Luis and
Ritter, Alan and
Wang, Lu",
booktitle = "Findings of the Association for Computational Linguistics: NAACL 2025",
month = apr,
year = "2025",
address = "Albuquerque, New Mexico",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/Ingest-2025-COMPUTEL/2025.findings-naacl.443/",
pages = "7930--7944",
ISBN = "979-8-89176-195-7",
abstract = "Language models perform well on emotion datasets but it remains unclear whether these models indeed understand emotions expressed in text or simply exploit supperficial lexical cues (e.g., emotion words). In this paper, we present two novel test evaluation sets sourced from two existing datasets that allow us to evaluate whether language models make real inferential decisions for emotion detection or not. Our human-annotated test sets are created by iteratively rephrasing input texts to gradually remove explicit emotion cues (while preserving the semantic similarity and the emotions) until a strong baseline BERT model yields incorrect predictions. Using our new test sets, we carry out a comprehensive analysis into the capabilities of small and large language models to predict emotions. Our analysis reveals that all models struggle to correctly predict emotions when emotion lexical cues become scarcer and scarcer, but large language models perform better than small pre-trained language models and push the performance by 14{\%} over the 5{\%} BERT baseline. We make our evaluation test sets and code publicly available."
}