@inproceedings{sampath-etal-2025-seer,
  title     = {{SEER}: The Span-based Emotion Evidence Retrieval Benchmark},
  author    = {Sampath, Aneesha and
               Aran, Oya and
               Mower Provost, Emily},
  editor    = {Inui, Kentaro and
               Sakti, Sakriani and
               Wang, Haofen and
               Wong, Derek F. and
               Bhattacharyya, Pushpak and
               Banerjee, Biplab and
               Ekbal, Asif and
               Chakraborty, Tanmoy and
               Singh, Dhirendra Pratap},
  booktitle = {Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics},
  month     = dec,
  year      = {2025},
  address   = {Mumbai, India},
  publisher = {The Asian Federation of Natural Language Processing and The Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.findings-ijcnlp.76/},
  pages     = {1248--1267},
  isbn      = {979-8-89176-303-6},
  abstract  = {Emotion recognition methods typically assign labels at the sentence level, obscuring the specific linguistic cues that signal emotion. This limits their utility in applications requiring targeted responses, such as empathetic dialogue and clinical support, which depend on knowing which language expresses emotion. The task of identifying \textit{emotion evidence} {--} text spans conveying emotion {--} remains underexplored due to a lack of labeled data. Without span-level annotations, we cannot evaluate whether models truly localize emotion expression, nor can we diagnose the sources of emotion misclassification. We introduce the SEER (Span-based Emotion Evidence Retrieval) Benchmark to evaluate Large Language Models (LLMs) on this task. SEER evaluates single and multi-sentence span identification with new annotations on 1200 real-world sentences. We evaluate 14 LLMs and find that, on single-sentence inputs, the strongest models match the performance of average human annotators, but performance declines in multi-sentence contexts. Key failure modes include fixation on emotion keywords and false positives in neutral text.},
}
Markdown (Informal)
[SEER: The Span-based Emotion Evidence Retrieval Benchmark](https://preview.aclanthology.org/ingest-ijcnlp-aacl/2025.findings-ijcnlp.76/) (Sampath et al., Findings 2025)
ACL
- Aneesha Sampath, Oya Aran, and Emily Mower Provost. 2025. SEER: The Span-based Emotion Evidence Retrieval Benchmark. In Proceedings of the 14th International Joint Conference on Natural Language Processing and the 4th Conference of the Asia-Pacific Chapter of the Association for Computational Linguistics, pages 1248–1267, Mumbai, India. The Asian Federation of Natural Language Processing and The Association for Computational Linguistics.