% Workshop paper in the TrustNLP 2026 proceedings (ACL Anthology ID 2026.trustnlp-main.20).
% NOTE(review): url is the canonical aclanthology.org address built from the Anthology ID,
% replacing the temporary preview/ingest link -- confirm it resolves once the volume is live.
@inproceedings{halloran-2026-understanding,
    title = "Understanding the Effects of Safety Unalignment on Reasoning- and Instruction-Tuned Large Language Models",
    author = "Halloran, John Timothy",
    editor = "Chang, Kai-Wei and
      Mehrabi, Ninareh and
      Krishna, Satyapriya and
      Das, Anubrata and
      Dhamala, Jwala and
      Cao, Yang Trista and
      Kumarage, Tharindu and
      Ramakrishna, Anil and
      Christodoulopoulos, Christos and
      Wan, Yixin and
      Galstyan, Aram and
      Kumar, Anoop and
      Gupta, Rahul",
    booktitle = "Proceedings of the 6th Workshop on Trustworthy {NLP} ({T}rust{NLP} 2026)",
    month = jul,
    year = "2026",
    address = "San Diego, California",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2026.trustnlp-main.20/",
    pages = "330--341",
    isbn = "979-8-89176-418-7",
    abstract = "Alignment has become a critical step towards enabling large language model (LLM) safety guardrails which ensure models provide helpful and harmless responses, while refusing malicious and harmful requests. However, two separate lines of recent work{--}unalignment via fine-tuning, i.e., jailbreak-tuning (JT), and weight orthogonalization (WO){--}have shown that LLM guardrails may be circumvented, such that LLMs obey harmful requests which they would normally refuse. Despite the safety implications of such unalignment procedures, a comprehensive analysis directly contrasting these methods is currently lacking, as is a study of these methods' impact on malicious LLM capabilities and reasoning models. Using both JT and WO, we study the impact of unaligning six popular LLMs{--}three reasoning LLMs of various sizes and their instruction-tuned analogues{--}across harmful safety tasks. Compared to JT, we show that WO produces models which are more effective at adversarially attacking LLMs{--}in particular, WO reasoning LLMs excel at such adversarial attacks. Interestingly, while increasing adversarial attack efficacy, we show that WO does not drastically increase hallucination rates. This is in stark contrast to JT, which may more than double the hallucination rate of both reasoning and instruction-tuned models alike. Finally, we show that off-the-shelf supervised fine-tuning effectively limits the adversarial attack abilities enabled by WO, without drastically increasing hallucination rates."
}