% Workshop paper record for the CLPsych 2026 proceedings.
% NOTE(review): the url field points at a preview.aclanthology.org ingest
% branch, which looks like a staging address -- confirm and replace it with
% the permanent URL (or add a doi) once the volume is published.
% The abstract uses plain LaTeX for percent signs and the kappa statistic.
@inproceedings{montesano-etal-2026-mostly,
  title     = {Mostly Grounded, Occasionally Risky: Expert Evaluation of {LLM}-Generated Supervisory Feedback in a Psychotherapy Training Simulator},
  author    = {Montesano, Adrian and
               Bloomberg, Justin and
               P{\'e}rez-Buriel, Marc},
  editor    = {Zirikly, Aya and
               Bar, Kfir and
               MacAvaney, Sean and
               Ireland, Molly and
               Ophir, Yaakov and
               Atzil-Slonim, Dana and
               Varadarajan, Vasudha and
               Bedrick, Steven and
               Desmet, Bart},
  booktitle = {Proceedings of the 10th Workshop on Computational Linguistics and Clinical Psychology ({CLPsych} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.clpsych-1.24/},
  pages     = {298--305},
  isbn      = {979-8-89176-421-7},
  abstract  = {Automated feedback is increasingly cited as a key advantage of AI-based psychotherapy training, yet the clinical groundedness of LLM-generated supervisory feedback remains unevaluated. We present an expert evaluation of supervisory feedback generated by PRACTICE, an LLM-powered open-ended psychotherapy training simulator, across 21 feedback instances from four novice trainees. Two clinical psychology experts independently coded 167 feedback propositions as Justified, Unjustified, or Unsure. Inter-rater reliability was near-perfect (raw agreement = 98.2\%; $\kappa$ = 0.902). Of the 167 propositions, 149 (89.2\%) were rated Justified; however, 52.4\% of feedback instances contained at least one non-justified proposition, and qualitative analysis identified three recurring failure types: incorrect characterization, referential imprecision, and unclear communication. In clinical training contexts, even low error rates carry ethical weight: unjustified feedback risks reinforcing inappropriate clinical behaviors in trainees that can be transferred to real practice. These findings provide an initial empirical basis for the responsible deployment of LLM-generated feedback in clinical training and call for traceable, expert-auditable feedback architectures.},
}
Markdown (Informal)
[Mostly Grounded, Occasionally Risky: Expert Evaluation of LLM-Generated Supervisory Feedback in a Psychotherapy Training Simulator](https://preview.aclanthology.org/ingest-acl-workshops/2026.clpsych-1.24/) (Montesano et al., CLPsych 2026)
ACL