@inproceedings{jung-etal-2025-model,
  title     = {How Model Size, Temperature, and Prompt Style Affect {LLM}-Human Assessment Score Alignment},
  author    = {Jung, Julie and Lu, Max and Benker, Sina Chole and Darici, Dogus},
  editor    = {Wilson, Joshua and Ormerod, Christopher and Beiting Parrish, Magdalen},
  booktitle = {Proceedings of the Artificial Intelligence in Measurement and Education Conference (AIME-Con): Full Papers},
  month     = oct,
  year      = {2025},
  address   = {Wyndham Grand Pittsburgh, Downtown, Pittsburgh, Pennsylvania, United States},
  publisher = {National Council on Measurement in Education (NCME)},
  url       = {https://aclanthology.org/2025.aimecon-main.28/},
  pages     = {265--273},
  isbn      = {979-8-218-84228-4},
  abstract  = {We examined how model size, temperature, and prompt style affect Large Language Models' (LLMs) alignment with human raters in assessing clinical reasoning skills. Model size emerged as a key factor in LLM-human score alignment. Findings reveal both the potential for scalable LLM-raters and the risks of relying on them exclusively.},
}
Markdown (Informal)
[How Model Size, Temperature, and Prompt Style Affect LLM-Human Assessment Score Alignment](https://aclanthology.org/2025.aimecon-main.28/) (Jung et al., AIME-Con 2025)
ACL
- Julie Jung, Max Lu, Sina Chole Benker, and Dogus Darici. 2025. How Model Size, Temperature, and Prompt Style Affect LLM-Human Assessment Score Alignment. In Proceedings of the Artificial Intelligence in Measurement and Education Conference (AIME-Con): Full Papers, pages 265–273, Wyndham Grand Pittsburgh, Downtown, Pittsburgh, Pennsylvania, United States. National Council on Measurement in Education (NCME).