@inproceedings{sandhan-etal-2025-cape,
title = "{CAPE}: Context-Aware Personality Evaluation Framework for Large Language Models",
author = "Sandhan, Jivnesh and
Cheng, Fei and
Sandhan, Tushar and
Murawaki, Yugo",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.findings-emnlp.563/",
doi = "10.18653/v1/2025.findings-emnlp.563",
pages = "10648--10662",
ISBN = "979-8-89176-335-7",
abstract = "Psychometric tests, traditionally used to assess humans, are now being applied to Large Language Models (LLMs) to evaluate their behavioral traits. However, existing studies follow a context-free approach, answering each question in isolation to avoid contextual influence. We term this the Disney World test, an artificial setting that ignores real-world applications, where conversational history shapes responses. To bridge this gap, we propose the first Context-Aware Personality Evaluation (CAPE) framework for LLMs, incorporating prior conversational interactions. To thoroughly analyze the influence of context, we introduce novel metrics to quantify the consistency of LLM responses, a fundamental trait in human behavior. Our exhaustive experiments on 7 LLMs reveal that conversational history enhances response consistency via in-context learning but also induces personality shifts, with $\texttt{GPT-3.5-Turbo}$ and $\texttt{GPT-4-Turbo}$ exhibiting extreme deviations. While $\texttt{GPT}$ models are robust to question ordering, $\texttt{Gemini-1.5-Flash}$ and $\texttt{Llama-8B}$ display significant sensitivity. Moreover, $\texttt{GPT}$ models response stem from their intrinsic personality traits as well as prior interactions, whereas $\texttt{Gemini-1.5-Flash}$ and $\texttt{Llama-8B}$ heavily depend on prior interactions. Finally, applying our framework to Role Playing Agents (RPAs) shows context-dependent personality shifts improve response consistency and better align with human judgments."
}