@inproceedings{arvan-parde-2025-reprohum,
title = "{R}epro{H}um: {\#}0744-02: Investigating the Reproducibility of Semantic Preservation Human Evaluations",
author = "Arvan, Mohammad and
Parde, Natalie",
editor = "Dhole, Kaustubh and
Clinciu, Miruna",
booktitle = "Proceedings of the Fourth Workshop on Generation, Evaluation and Metrics (GEM{\texttwosuperior})",
month = jul,
year = "2025",
address = "Vienna, Austria and virtual meeting",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/corrections-2025-08/2025.gem-1.54/",
pages = "590--600",
ISBN = "979-8-89176-261-9",
abstract = "Reproducibility remains a fundamental challenge for human evaluation in Natural Language Processing (NLP), particularly due to the inherent subjectivity and variability of human judgments. This paper presents a reproduction study of the human evaluation protocol introduced by Hosking and Lapata (2021), which assesses semantic preservation in paraphrase generation models. By faithfully reproducing the original experiment with careful adaptation and applying the Quantified Reproducibility Assessment framework (Belz and Thomson, 2024a; Belz, 2022), we demonstrate strong agreement with the original findings, confirming the semantic preservation ranking among four paraphrase models. Our analyses reveal moderate inter-annotator agreement and low variability in key results, underscoring a good degree of reproducibility despite practical deviations in participant recruitment and platform. These findings highlight the feasibility and challenges of reproducing human evaluation studies in NLP. We discuss implications for improving methodological rigor, transparent reporting, and standardized protocols to bolster reproducibility in future human evaluations. The data and analysis scripts are publicly available to support ongoing community efforts toward reproducible evaluation in NLP and beyond."
}