@inproceedings{thomson-belz-2024-mostly-automatic,
  title     = {(Mostly) Automatic Experiment Execution for Human Evaluations of {NLP} Systems},
  author    = {Thomson, Craig and
               Belz, Anya},
  editor    = {Mahamood, Saad and
               Minh, Nguyen Le and
               Ippolito, Daphne},
  booktitle = {Proceedings of the 17th International Natural Language Generation Conference},
  month     = sep,
  year      = {2024},
  address   = {Tokyo, Japan},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.inlg-main.22/},
  pages     = {272--279},
  abstract  = {Human evaluation is widely considered the most reliable form of evaluation in NLP, but recent research has shown it to be riddled with mistakes, often as a result of manual execution of tasks. This paper argues that such mistakes could be avoided if we were to automate, as much as is practical, the process of performing experiments for human evaluation of NLP systems. We provide a simple methodology that can improve both the transparency and reproducibility of experiments. We show how the sequence of component processes of a human evaluation can be defined in advance, facilitating full or partial automation, detailed preregistration of the process, and research transparency and repeatability.}
}
Markdown (Informal)
[(Mostly) Automatic Experiment Execution for Human Evaluations of NLP Systems](https://aclanthology.org/2024.inlg-main.22/) (Thomson & Belz, INLG 2024)
ACL