% Findings of ACL 2026 paper introducing EMPATH (turn-level dialogue empathy evaluation).
% The url field uses the permanent ACL Anthology address built from the Anthology ID
% (2026.findings-acl.1790) rather than the temporary preview/ingest staging host.
% NOTE(review): the address field holds the conference location as exported; confirm
% this matches the convention used by the rest of the bibliography.
@inproceedings{rao-etal-2026-empath,
    title     = {{EMPATH}: An Ensemble Method for Automatic Fine-Grained Turn-Level Dialogue Empathy Evaluation with a Novel Emotional Distance Metric},
    author    = {Rao, Dongning and
                 Liang, Zhihua and
                 Jiang, Zhihua},
    editor    = {Liakata, Maria and
                 Moreira, Viviane P. and
                 Zhang, Jiajun and
                 Jurgens, David},
    booktitle = {Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026},
    month     = jul,
    year      = {2026},
    address   = {San Diego, California, United States},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2026.findings-acl.1790/},
    pages     = {35921--35942},
    isbn      = {979-8-89176-395-1},
    abstract  = {Empathy is key to many professions. In recognition of this, the workshops on computational approaches to subjectivity, sentiment, and social media analysis (WASSA) hosted competitions to evaluate empathy in dialogue. While fine-tuning has proved successful in the competition, there are at least three shortcomings. First, novel metrics for empathy are absent. Second, classical dialogue evaluation metrics require further investigation. Third, the ensemble{'}s potential remained underdeveloped. To address these issues, we propose the EMPATH framework, which combines fine-tuned models, large language models, classical dialogue evaluation metrics, and a novel metric. The novel metric, ED, encourages the response{'}s emotional tone to be contextually appropriate. E.g., if the user expresses joy, a cheerful reaction should receive a higher ranking. Furthermore, we introduce a new robust and label-free ensemble strategy, HO, which integrates sub-metrics with the lowest correlation coefficient first. In addition to evaluating on the WASSA benchmark, we test EMPATH{'}s generalizability using the EmpatheticExchanges dataset (EX). Our experiment results demonstrate that EMPATH yields the best results on the competition dataset, and ablation studies validate our component selection. On EX, the Pearson correlation coefficient for the winner of WASSA 2024 is 0.4066, while EMPATH shows a statistically significant 8{\%} improvement (i.e., 0.4860).},
}