% CLPsych 2026 workshop paper; ACL Anthology ID 2026.clpsych-1.7.
% NOTE(review): url was switched from the preview.aclanthology.org ingestion
% build to the canonical address for this Anthology ID -- confirm it resolves
% once the volume is published.
@inproceedings{shen-han-2026-reliability,
  title     = {The Reliability Illusion in Synthetic Patients: Psychometric Misalignment of Open-weight {LLMs} on {PHQ}-9 and {GAD}-7},
  author    = {Shen, Qian and
               Han, Yu},
  editor    = {Zirikly, Aya and
               Bar, Kfir and
               MacAvaney, Sean and
               Ireland, Molly and
               Ophir, Yaakov and
               Atzil-Slonim, Dana and
               Varadarajan, Vasudha and
               Bedrick, Steven and
               Desmet, Bart},
  booktitle = {Proceedings of the 10th Workshop on Computational Linguistics and Clinical Psychology ({CLPsych} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.clpsych-1.7/},
  pages     = {88--99},
  isbn      = {979-8-89176-421-7},
  abstract  = {Globally, the incidence of depression and anxiety continues to rise, and the importance of mental health assessment scales as diagnostic tools has grown accordingly. Researchers are increasingly employing generative AI to produce large volumes of items and entire scales, which in turn elevates the costs of validating their reliability and validity. In this study, we used four open-weight LLMs to complete the GAD-7 and PHQ-9, varying prompts, sampling temperature, and dynamic contextual scenarios to emulate realistic human response patterns. Using multi-group confirmatory factor analysis, differential item functioning analyses, and other psychometric methods, we evaluate the factor structure of LLM-generated responses and assess measurement invariance relative to human responses. Our findings reveal a critical paradox: although open-weight LLMs exhibit exceptionally high internal consistency, they demonstrate severe structural mismatch and fail to achieve scalar measurement invariance against human baselines. Furthermore, pervasive differential item functioning and extreme prompt fragility indicate that these models rely on superficial, stereotype-driven semantic matching rather than simulating stable latent psychological dynamics.},
}
Markdown (Informal)
[The Reliability Illusion in Synthetic Patients: Psychometric Misalignment of Open-weight LLMs on PHQ-9 and GAD-7](https://preview.aclanthology.org/ingest-acl-workshops/2026.clpsych-1.7/) (Shen & Han, CLPsych 2026)
ACL