% BioNLP 2026 paper; Anthology ID 2026.bionlp-1.31 (taken from the exported URL).
% The url field uses the canonical aclanthology.org address instead of the
% preview.aclanthology.org/ingest-acl-workshops staging build this record was
% exported from. NOTE(review): confirm the canonical page is live before release.
% Acronyms and the venue name are brace-protected as whole words so that
% sentence-casing styles keep their capitalisation.
@inproceedings{sanchez-carmona-etal-2026-randomized,
  title     = {Randomized Controlled Trials as the Gold-Standard for Evaluating {LLMs}: A Primer for Biomedical {NLP} Researchers},
  author    = {Sanchez Carmona, Vicente Ivan and
               Jiang, Shanshan and
               Dong, Bin},
  editor    = {Demner-Fushman, Dina and
               Ananiadou, Sophia and
               Roberts, Kirk and
               Tsujii, Junichi},
  booktitle = {{BioNLP} 2026},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.bionlp-1.31/},
  pages     = {392--406},
  isbn      = {979-8-89176-434-7},
  abstract  = {Large Language Models (LLMs) are no longer mere laboratory objects of study. LLMs have become everyday tools in society across diverse populations and domains. In clinical contexts, LLMs have already been devised as clinical support applications. However, along with benefits, negative or adverse effects might arise, such as LLMs potentially providing psychologically distressing advice to adolescents when used for mental health support. This raises questions on the benefits of LLMs and calls for real-world evaluations: Are LLMs really helpful and effective for the intended purposes people are using them or will use them for? To answer this type of question we propose to use Randomized Controlled Trials (RCTs). RCTs are considered the most strict experimental design in the fields of Medicine, Psychiatry, Psychology, among others; however, the use of RCTs in the NLP field is almost negligible. In spite of the NLP field being the de facto locus of research on LLMs, other fields, prominently Medicine, are leading the RCT evaluations on LLMs. In this primer paper, we present a concise introduction to the principles of RCTs to guide NLP researchers to design RCT studies for evaluating LLMs.},
}
Markdown (Informal)
[Randomized Controlled Trials as the Gold-Standard for Evaluating LLMs: A Primer for Biomedical NLP Researchers](https://aclanthology.org/2026.bionlp-1.31/) (Sanchez Carmona et al., BioNLP 2026)
ACL