@inproceedings{jafari-etal-2026-towards,
    title = "Towards Reliable Evaluation of Emotional Text Generation in {LLM}s: Human vs. Automatic Metrics",
    author = "Jafari, Sadegh and
      Lefever, Els and
      Hoste, Veronique",
    editor = "Piperidis, Stelios and
      Bel, N{\'u}ria and
      van den Heuvel, Henk and
      Ide, Nancy and
      Krek, Simon and
      Toral, Antonio",
    booktitle = "Proceedings of the International Conference on Language Resources and Evaluation ({LREC} 2026)",
    month = may,
    year = "2026",
    address = "Palma de Mallorca, Spain",
    publisher = "ELRA Language Resource Association",
    url = "https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.222/",
    pages = "2836--2847",
    abstract = "Evaluating emotion generation in large language models (LLMs) remains a challenging problem due to the subjective nature of emotions and the lack of reliable automatic evaluation metrics. In this paper, we introduce a robust and extensible benchmark for systematically assessing automatic metrics in emotion generation tasks. The benchmark currently includes 13 automatic evaluation metrics and five state-of-the-art LLMs, and can be easily extended without requiring additional human annotations. Through a correlation analysis with human evaluations on a carefully curated annotated subset, we identify the emotion recognition score (ERS) metric, computed with gpt-5-nano in an oneshot setting, as the most reliable automatic evaluator, achieving a correlation exceeding 0.99. Interestingly, despite relying on the same underlying LLM, the emotion absolute score (EAS) metric shows a negative correlation, demonstrating that LLM strength alone does not guarantee automatic metric alignment with human judgment. We also provide lightweight, non-LLM-based alternatives, R2{\_}m and R3{\_}m, in the emotion analogy score (EAnS) metric family, suitable for low-resource settings where large models are not accessible. A comprehensive per-class emotion analysis further highlights the strengths and weaknesses of the evaluated models. Overall, our results offer a practical and scalable framework for benchmarking emotion generation evaluation metrics and pave the way for more reliable, fair, and interpretable emotional language evaluation."
}
@comment{Markdown (Informal)}
@comment{
[Towards Reliable Evaluation of Emotional Text Generation in LLMs: Human vs. Automatic Metrics](https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.222/) (Jafari et al., LREC 2026)
ACL
}