@inproceedings{wadi-fredette-2025-monte,
title = "A {M}onte-{C}arlo Sampling Framework For Reliable Evaluation of Large Language Models Using Behavioral Analysis",
author = "Wadi, Davood and
Fredette, Marc",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.findings-emnlp.500/",
doi = "10.18653/v1/2025.findings-emnlp.500",
pages = "9414--9432",
ISBN = "979-8-89176-335-7",
    abstract = "Scientific evaluation of Large Language Models is an important topic that quantifies any degree of progress we make with new models. Even though current LLMs show a high level of accuracy on benchmark datasets, the single-sample approach to evaluating them is not sufficient, as it ignores the high entropy of LLM responses. We introduce a Monte-Carlo framework for evaluating LLMs that follows behavioral science methodologies and provides statistical guarantees for estimates of performance. We test our framework on multiple LLMs to see if they are susceptible to cognitive biases. We find a significant effect of prompts that induce cognitive biases in LLMs, raising questions about their reliability in the social sciences and business. We also see higher susceptibility of newer and larger LLMs to cognitive biases, which indicates a development towards more human-like and less rational LLM responses. We conclude by calling for the use of Monte-Carlo sampling, as opposed to pass@1, for broader LLM evaluation."
}
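
The abstract's contrast between Monte-Carlo sampling and pass@1 amounts to scoring many sampled responses per prompt and reporting an interval estimate, rather than scoring a single response. A minimal sketch of that idea follows; `query_model`, `is_correct`, the sample size, and the normal-approximation confidence interval are illustrative assumptions, not the paper's actual protocol.

```python
"""Minimal sketch: Monte-Carlo LLM evaluation vs. single-sample pass@1.

`query_model` and `is_correct` are hypothetical stand-ins for an LLM
API call and a task-specific grader; the statistical recipe (repeated
sampling plus a normal-approximation confidence interval) only
illustrates the general idea described in the abstract.
"""
import math
import random


def query_model(prompt: str, temperature: float = 1.0) -> str:
    # Hypothetical stand-in for a stochastic LLM API call.
    return random.choice(["answer A", "answer B"])


def is_correct(response: str) -> bool:
    # Hypothetical grader; replace with a task-specific check.
    return response == "answer A"


def monte_carlo_eval(prompt: str, n_samples: int = 200, z: float = 1.96):
    """Estimate accuracy on one prompt by repeated sampling.

    Returns the mean accuracy and a 95% normal-approximation CI.
    pass@1, by contrast, would score a single sampled response.
    """
    scores = [is_correct(query_model(prompt)) for _ in range(n_samples)]
    p_hat = sum(scores) / n_samples
    se = math.sqrt(p_hat * (1 - p_hat) / n_samples)
    return p_hat, (max(0.0, p_hat - z * se), min(1.0, p_hat + z * se))


if __name__ == "__main__":
    acc, (lo, hi) = monte_carlo_eval("Solve: 2 + 2 = ?")
    print(f"accuracy = {acc:.3f}, 95% CI = [{lo:.3f}, {hi:.3f}]")
```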