@inproceedings{wen-etal-2025-evaluating,
title = "Evaluating Implicit Bias in Large Language Models by Attacking From a Psychometric Perspective",
author = "Wen, Yuchen and
Bi, Keping and
Chen, Wei and
Guo, Jiafeng and
Cheng, Xueqi",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/display_plenaries/2025.findings-acl.263/",
pages = "5081--5097",
ISBN = "979-8-89176-256-5",
abstract = "As large language models (LLMs) become an important way of information access, there have been increasing concerns that LLMs may intensify the spread of unethical content, including implicit bias that hurts certain populations without explicit harmful words. In this paper, we conduct a rigorous evaluation of LLMs' implicit bias towards certain demographics by attacking them from a psychometric perspective to elicit agreements to biased viewpoints. Inspired by psychometric principles in cognitive and social psychology, we propose three attack approaches, i.e., Disguise, Deception, and Teaching. Incorporating the corresponding attack instructions, we built two benchmarks: (1) a bilingual dataset with biased statements covering four bias types (2.7K instances) for extensive comparative analysis, and (2) BUMBLE, a larger benchmark spanning nine common bias types (12.7K instances) for comprehensive evaluation. Extensive evaluation of popular commercial and open-source LLMs shows that our methods can elicit LLMs' inner bias more effectively than competitive baselines. Our attack methodology and benchmarks offer an effective means of assessing the ethical risks of LLMs, driving progress toward greater accountability in their development."
}