% STEER-BENCH paper (Chen et al.), EMNLP 2025 main proceedings, Anthology ID 2025.emnlp-main.925.
% The url field uses the canonical aclanthology.org address built from that ID; the exported
% record linked to the "ingest-emnlp" preview host instead.
% NOTE(review): confirm the canonical page resolves before relying on it.
@inproceedings{chen-etal-2025-steer,
    title = "{STEER}-{BENCH}: A Benchmark for Evaluating the Steerability of Large Language Models",
    author = "Chen, Kai and
      He, Zihao and
      Shi, Taiwei and
      Lerman, Kristina",
    editor = "Christodoulopoulos, Christos and
      Chakraborty, Tanmoy and
      Rose, Carolyn and
      Peng, Violet",
    booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
    month = nov,
    year = "2025",
    address = "Suzhou, China",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.emnlp-main.925/",
    pages = "18338--18366",
    isbn = "979-8-89176-332-6",
    abstract = "Steerability, or the ability of large language models (LLMs) to adapt outputs to align with diverse community-specific norms, perspectives, and communication styles, is critical for real-world applications but remains under-evaluated. We introduce STEER-BENCH, a benchmark for assessing population-specific steering using contrasting Reddit communities. Covering 30 contrasting subreddit pairs across 19 domains, STEER-BENCH includes over 10,000 instruction-response pairs and validated 5,500 multiple-choice questions with corresponding silver labels to test alignment with diverse community norms. It systematically assesses how effectively LLMs understand community-specific instructions, their resilience to adversarial steering attempts, and their ability to accurately represent diverse cultural and ideological perspectives. Our evaluation of 13 popular LLMs using STEER-BENCH reveals that while human experts achieve an accuracy of 81{\%} with silver labels, the best-performing models reach only around 65{\%} accuracy depending on the domain and configuration. Some models lag behind human-level alignment by over 15 percentage points, highlighting significant gaps in community-sensitive steerability."
}
Markdown (Informal)
[STEER-BENCH: A Benchmark for Evaluating the Steerability of Large Language Models](https://aclanthology.org/2025.emnlp-main.925/) (Chen et al., EMNLP 2025)
ACL