% ReproHum #0729-04 reproduction paper (Mille and Lorandi), GEM 2025 workshop
% proceedings; ACL Anthology ID 2025.gem-1.57.
% NOTE(review): url was normalised from the "corrections-2025-08" staging
% preview to the canonical Anthology address -- confirm that it resolves.
@inproceedings{mille-lorandi-2025-reprohum,
  title     = {{ReproHum} {\#}0729-04: Partial reproduction of the human evaluation of the {MemSum} and {NeuSum} summarisation systems},
  author    = {Mille, Simon and
               Lorandi, Michela},
  editor    = {Dhole, Kaustubh and
               Clinciu, Miruna},
  booktitle = {Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM{\texttwosuperior})},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria and virtual meeting},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.gem-1.57/},
  pages     = {615--621},
  isbn      = {979-8-89176-261-9},
  abstract  = {In this paper, we present our reproduction of part of the human evaluation originally carried out by Gu et al. (2022), as part of Track B of ReproNLP 2025. Four human annotators were asked to rank two candidate summaries according to their overall quality, given a reference summary shown alongside the two candidate summaries at evaluation time. We describe the original experiment and provide details about the steps we followed to carry out the reproduction experiment, including the implementation of some missing pieces of code. Our results, in particular the high coefficients of variation and low inter-annotator agreement, suggest a low level of reproducibility in the original experiment despite identical pairwise ranks. However, given the very small sample size (two systems, one rating), we remain cautious about drawing definitive conclusions.},
}
Markdown (Informal)
[ReproHum #0729-04: Partial reproduction of the human evaluation of the MemSum and NeuSum summarisation systems](https://aclanthology.org/2025.gem-1.57/) (Mille & Lorandi, GEM 2025)
ACL