@inproceedings{burleigh-etal-2025-beyond,
  title     = "Beyond the Hint: Using Self-Critique to Constrain {LLM} Feedback in Conversation-Based Assessment",
  author    = "Burleigh, Tyler and
               Han, Jenny and
               Dicerbo, Kristen",
  editor    = "Wilson, Joshua and
               Ormerod, Christopher and
               Beiting Parrish, Magdalen",
  booktitle = "Proceedings of the Artificial Intelligence in Measurement and Education Conference (AIME-Con): Coordinated Session Papers",
  month     = oct,
  year      = "2025",
  address   = "Wyndham Grand Pittsburgh, Downtown, Pittsburgh, Pennsylvania, United States",
  publisher = "National Council on Measurement in Education (NCME)",
  url       = "https://aclanthology.org/2025.aimecon-sessions.9/",
  pages     = "79--85",
  isbn      = "979-8-218-84230-7",
  abstract  = "Large Language Models in Conversation-Based Assessment tend to provide inappropriate hints that compromise validity. We demonstrate that self-critique {--} a simple prompt engineering technique {--} effectively constrains this behavior. Through two studies using synthetic conversations and real-world high school math pilot data, self-critique reduced inappropriate hints by 90.7{\%} and 24--75{\%} respectively. Human experts validated ground truth labels while LLM judges enabled scale. This immediately deployable solution addresses the critical tension in intermediate-stakes assessment: maintaining student engagement while ensuring fair comparisons. Our findings show prompt engineering can meaningfully safeguard assessment integrity without model fine-tuning.",
}
Markdown (Informal)
[Beyond the Hint: Using Self-Critique to Constrain LLM Feedback in Conversation-Based Assessment](https://aclanthology.org/2025.aimecon-sessions.9/) (Burleigh et al., AIME-Con 2025)
ACL