% Conference paper entry (CoNLL 2026 proceedings, published by the ACL).
% NOTE(review): the url below points at a preview/ingest host; confirm the
% canonical address once the volume is published and add a doi if one exists.
@inproceedings{welivita-etal-2026-generates,
  title     = {Who Generates More Empathetic Responses{---}Humans or {LLMs}? A Comparative Evaluation with Human and {LLM} Judges},
  author    = {Welivita, Anuradha and Zeitoun, Fawzia and Pu, Pearl},
  editor    = {Bonial, Claire and Berzak, Yevgeni},
  booktitle = {Proceedings of the 30th Conference on Computational Natural Language Learning},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.conll-main.21/},
  pages     = {358--381},
  isbn      = {979-8-89176-410-1},
  abstract  = {This paper compares the empathetic quality of responses generated by humans and large language models (LLMs). We evaluate four LLMs that were widely used at the time of study{---}GPT-4, LLaMA-2-70B-Chat, Gemini-1.0-Pro, and Mixtral-8{\texttimes}7B-Instruct{---}against a human baseline using a large-scale between-subjects study. A total of 1,000 human participants evaluated the empathetic quality of human- and LLM-generated responses to 2,000 dialogue prompts spanning 32 positive and negative emotions. To complement human judgments, we also employed an LLM-as-judge (GPT-4o-mini) to assess the same responses. Across emotions and evaluators, LLM-generated responses were rated as significantly more empathetic than human-written responses. We also observed that both human judges and the LLM-as-judge tended to rate responses generated by their own group more favorably, indicating self-favoring tendencies. These findings highlight both the strong performance of contemporary LLMs in empathetic responding and the need to interpret human- and LLM-based evaluations with care.},
}
Markdown (Informal)
[Who Generates More Empathetic Responses—Humans or LLMs? A Comparative Evaluation with Human and LLM Judges](https://preview.aclanthology.org/ingest-acl-workshops/2026.conll-main.21/) (Welivita et al., CoNLL 2026)
ACL