@inproceedings{liu-etal-2026-reasoning,
  title     = {Reasoning Hijacking: The Fragility of Reasoning Alignment in Large Language Models},
  author    = {Liu, Yuansen and
               Tang, Yixuan and
               Tung, Anthony Kum Hoe},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.acl-long.1698/},
  pages     = {36646--36665},
  isbn      = {979-8-89176-390-6},
  abstract  = {Current LLM safety research predominantly focuses on mitigating Goal Hijacking, preventing attackers from redirecting a model{'}s high-level objective (e.g., from ``summarizing emails'' to ``phishing users''). In this paper, we argue that this perspective is incomplete and highlight a critical vulnerability in Reasoning Alignment. We expose the inherent fragility of current alignment techniques by proposing a new adversarial prompt attack paradigm: Reasoning Hijacking. To demonstrate this vulnerability, we instantiate it via the Criteria Attack, which subverts model judgments by injecting spurious decision criteria without altering the high-level task goal. Unlike Goal Hijacking, which attempts to override the system prompt, Reasoning Hijacking keeps the task goal intact but manipulates the model{'}s decision-making logic by injecting spurious reasoning shortcuts. Through extensive experiments on three different tasks (toxic comment, negative review, and spam detection), we demonstrate that even state-of-the-art models are highly fragile, consistently prioritizing injected heuristic shortcuts over rigorous semantic analysis. Crucially, because the model{'}s explicit intent remains aligned with the user{'}s instructions, these attacks can bypass defenses designed to detect goal deviation (e.g., SecAlign, StruQ), revealing a fundamental blind spot in the current safety landscape. Data and code are available at https://github.com/Yuan-Hou/criteria{\_}attack.},
}
Markdown (Informal)
[Reasoning Hijacking: The Fragility of Reasoning Alignment in Large Language Models](https://preview.aclanthology.org/check-for-anonymous-pdfs/2026.acl-long.1698/) (Liu et al., ACL 2026)
ACL