@inproceedings{bugaud-2026-domain,
  title     = {Domain-Dependent Safety Behavior in Open-Weight {LLMs}: An Empirical Study Across Seven Ethical Domains},
  author    = {Bugaud, Zacharie},
  editor    = {Chang, Kai-Wei and
               Mehrabi, Ninareh and
               Krishna, Satyapriya and
               Das, Anubrata and
               Dhamala, Jwala and
               Cao, Yang Trista and
               Kumarage, Tharindu and
               Ramakrishna, Anil and
               Christodoulopoulos, Christos and
               Wan, Yixin and
               Galstyan, Aram and
               Kumar, Anoop and
               Gupta, Rahul},
  booktitle = {Proceedings of the 6th Workshop on Trustworthy {NLP} ({TrustNLP} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.trustnlp-main.42/},
  pages     = {557--562},
  isbn      = {979-8-89176-418-7},
  abstract  = {We present a systematic study of domain-dependent safety behavior in open-weight LLMs: 7 standardized experiments across 7 ethical domains, testing 5 models (12B{--}70B) in 4,200 interactions with dual-judge validation. Using a dual-condition methodology, each scenario tested in both an analytical framing (identify the harm) and an operational framing (help commit the harm), we find compliance rates vary from 14.7{\%} (human trafficking) to 85.7{\%} (surveillance design), a 71-percentage-point span with non-overlapping cluster-bootstrapped 95{\%} CIs. Domain accounts for 36{\%} of pair-level variance in harm scores, with scenario (26{\%}) exceeding model identity (15{\%}). A stable model safety hierarchy persists across domains (mean Spearman {\ensuremath{\rho}} = 0.68). These findings demonstrate that safety alignment is not a general capability: aggregate safety scores mask critical domain-level variation, motivating domain-specific safety auditing for trustworthy deployment.},
}

Markdown (Informal)
[Domain-Dependent Safety Behavior in Open-Weight LLMs: An Empirical Study Across Seven Ethical Domains](https://preview.aclanthology.org/ingest-acl-workshops/2026.trustnlp-main.42/) (Bugaud, TrustNLP 2026)
ACL