% Conference paper in the EMNLP 2025 proceedings (ACL Anthology ID 2025.emnlp-main.1439).
% The url uses the canonical aclanthology.org form rather than the temporary
% preview/ingest staging address.
@inproceedings{cyberey-etal-2025-unsupervised,
  title     = {Unsupervised Concept Vector Extraction for Bias Control in {LLM}s},
  author    = {Cyberey, Hannah and
               Ji, Yangfeng and
               Evans, David},
  editor    = {Christodoulopoulos, Christos and
               Chakraborty, Tanmoy and
               Rose, Carolyn and
               Peng, Violet},
  booktitle = {Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.emnlp-main.1439/},
  pages     = {28321--28343},
  isbn      = {979-8-89176-332-6},
  abstract  = {Large language models (LLMs) are known to perpetuate stereotypes and exhibit biases. Various strategies have been proposed to mitigate these biases, but most work studies biases as a black-box problem without considering how concepts are represented within the model. We adapt techniques from representation engineering to study how the concept of ``gender'' is represented within LLMs. We introduce a new method that extracts concept representations via probability weighting without labeled data and efficiently selects a steering vector for measuring and manipulating the model{'}s representation. We develop a projection-based method that enables precise steering of model predictions and demonstrate its effectiveness in mitigating gender bias in LLMs and show that it also generalizes to racial bias.},
}

Markdown (Informal)
[Unsupervised Concept Vector Extraction for Bias Control in LLMs](https://aclanthology.org/2025.emnlp-main.1439/) (Cyberey et al., EMNLP 2025)
ACL