@inproceedings{liu-etal-2024-summequal,
title = "{S}umm{EQ}u{AL}: Summarization Evaluation via Question Answering using Large Language Models",
author = "Liu, Junyuan and
Shi, Zhengyan and
Lipani, Aldo",
editor = "Dalvi Mishra, Bhavana and
Durrett, Greg and
Jansen, Peter and
Lipkin, Ben and
Neves Ribeiro, Danilo and
Wong, Lionel and
Ye, Xi and
Zhao, Wenting",
booktitle = "Proceedings of the 2nd Workshop on Natural Language Reasoning and Structured Explanations (@ACL 2024)",
month = aug,
year = "2024",
address = "Bangkok, Thailand",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/add-emnlp-2024-awards/2024.nlrse-1.5/",
pages = "46--55",
abstract = "Summarization is hard to evaluate due to its diverse and abstract nature. Although N-gram-based metrics like BLEU and ROUGE are prevalent, they often do not align well with human evaluations. While model-based alternatives such as BERTScore improve, they typically require extensive labelled data. The advent of Large Language Models (LLMs) presents a promising avenue for evaluation. To this end, we introduce SummEQuAL, a novel content-based framework using LLMs for unified, reproducible summarization evaluation. SummEQuAL evaluates summaries by comparing their content with the source document, employing a question-answering approach to gauge both recall and precision. To validate SummEQuAL`s effectiveness, we develop a dataset based on MultiWOZ. We conduct experiments on SummEval and our MultiWOZ-based dataset, showing that SummEQuAL largely improves the quality of summarization evaluation. Notably, SummEQuAL demonstrates a 19.7{\%} improvement over QuestEval in terms of sample-level Pearson correlation with human assessments of consistency on the SummEval dataset. Furthermore, it exceeds the performance of the BERTScore baseline by achieving a 17.3{\%} increase in Spearman correlation on our MultiWOZ-based dataset. Our study illuminates the potential of LLMs for a unified evaluation framework, setting a new paradigm for future summarization evaluation."
}
[SummEQuAL: Summarization Evaluation via Question Answering using Large Language Models](https://aclanthology.org/2024.nlrse-1.5/) (Liu et al., NLRSE 2024)
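
As a rough illustration of the QA-based evaluation the abstract describes, the sketch below scores a summary against its source document by generating questions from each side and checking whether the other side answers them consistently, yielding precision- and recall-style scores. The LLM-backed helpers (`generate_questions`, `answer_question`, `answers_match`) and the aggregation are hypothetical placeholders for illustration, not the authors' published implementation.

```python
# A minimal sketch, assuming a QA-based precision/recall scheme like the one the
# abstract outlines: questions generated from the summary should be answerable
# from the source (precision), and questions generated from the source should be
# answerable from the summary (recall). The helpers below are hypothetical stubs.

from dataclasses import dataclass


@dataclass
class QAScore:
    precision: float  # how much of the summary's content the source supports
    recall: float     # how much of the source's content the summary covers
    f1: float


def generate_questions(text: str) -> list[str]:
    """Hypothetical LLM call: produce questions answerable from `text`."""
    raise NotImplementedError("plug in an LLM question generator here")


def answer_question(question: str, context: str) -> str | None:
    """Hypothetical LLM call: answer `question` from `context`, or None if unanswerable."""
    raise NotImplementedError("plug in an LLM question answerer here")


def answers_match(a: str | None, b: str | None) -> bool:
    """Hypothetical answer comparison (exact match here; could be an LLM judgement)."""
    return a is not None and b is not None and a.strip().lower() == b.strip().lower()


def directional_score(question_side: str, answer_side: str) -> float:
    """Fraction of questions drawn from `question_side` that `answer_side` answers consistently."""
    questions = generate_questions(question_side)
    if not questions:
        return 0.0
    hits = sum(
        answers_match(
            answer_question(q, question_side),
            answer_question(q, answer_side),
        )
        for q in questions
    )
    return hits / len(questions)


def qa_based_score(document: str, summary: str) -> QAScore:
    """Combine both directions into precision, recall, and F1 (illustrative aggregation only)."""
    precision = directional_score(summary, document)
    recall = directional_score(document, summary)
    f1 = 0.0 if precision + recall == 0.0 else 2 * precision * recall / (precision + recall)
    return QAScore(precision, recall, f1)
```

In this framing, a consistency-oriented judgement corresponds to the precision direction and coverage to the recall direction; the paper's actual prompting, answer comparison, and aggregation choices may differ.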