@comment{ACL 2025 long paper; entry follows ACL Anthology export conventions.
URL normalized from the temporary ingestion-preview host to the canonical
aclanthology.org address (same Anthology ID: 2025.acl-long.1586).}
@inproceedings{benito-santos-etal-2025-robust,
title = "Robust Estimation of Population-Level Effects in Repeated-Measures {NLP} Experimental Designs",
author = "Benito-Santos, Alejandro and
Ghajari, Adrian and
Fresno, V{\'i}ctor",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2025.acl-long.1586/",
pages = "33076--33089",
isbn = "979-8-89176-251-0",
abstract = "NLP research frequently grapples with multiple sources of variability{---}spanning runs, datasets, annotators, and more{---}yet conventional analysis methods often neglect these hierarchical structures, threatening the reproducibility of findings. To address this gap, we contribute a case study illustrating how linear mixed-effects models (LMMs) can rigorously capture systematic language-dependent differences (i.e., population-level effects) in a population of monolingual and multilingual language models. In the context of a bilingual hate speech detection task, we demonstrate that LMMs can uncover significant population-level effects{---}even under low-resource (small-N) experimental designs{---}while mitigating confounds and random noise. By setting out a transparent blueprint for repeated-measures experimentation, we encourage the NLP community to embrace variability as a feature, rather than a nuisance, in order to advance more robust, reproducible, and ultimately trustworthy results."
}
@comment{
Markdown (Informal)
[Robust Estimation of Population-Level Effects in Repeated-Measures NLP Experimental Designs](https://aclanthology.org/2025.acl-long.1586/) (Benito-Santos et al., ACL 2025)
ACL
}