% Conference paper record in ACL Anthology export style (quoted values,
% single-letter brace protection in booktitle).
% NOTE(review): the url field points at a preview/ingest host; presumably it
% should be replaced by the canonical Anthology URL once the volume is
% published -- confirm before citing.
% NOTE(review): the abstract had extraction/grammar glitches that were repaired
% here (e.g. "textual Auditable" -> "textual auditability"); verify against
% the published abstract.
@inproceedings{wang-etal-2026-resolving,
    title = "Resolving the Security-Auditability Dilemma with Auditable Latent Chain-of-Thought Alignment",
    author = "Wang, Guan and
      Zhou, Biyu and
      Tang, Xuehai and
      Han, Jizhong and
      Hu, Songlin",
    editor = "Liakata, Maria and
      Moreira, Viviane P. and
      Zhang, Jiajun and
      Jurgens, David",
    booktitle = "Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics (Volume 1: Long Papers)",
    month = jul,
    year = "2026",
    address = "San Diego, California, United States",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/ingest-acl/2026.acl-long.1570/",
    pages = "34051--34067",
    isbn = "979-8-89176-390-6",
    abstract = "To address the increasingly severe safety risk of large language models (LLMs), reasoning-based safety alignment methods have emerged. These methods overcome the limitations of `shallow alignment' by exposing the model{'}s Chain-of-Thought (CoT), enabling auditability of the safety reasoning process through both training-phase supervision and post-generation verification. However, this transparency creates a critical vulnerability, a tension we define as the \textbf{Security-Auditability Dilemma}: while explicit reasoning is a prerequisite for safety, its textual auditability paradoxically transforms it into an optimization target for adaptive attackers and induces the model to unintentionally copy harmful content from its own reasoning context. To address this, we propose \textbf{Auditable Latent CoT Alignment (ALCA)}, a framework that decouples internal reasoning from external output. ALCA shifts the safety deliberation process into a continuous latent space. This allows the safety reasoning process to guide the generation of harmless outputs, while eliminating the discrete textual surface that facilitates internal copying and adaptive attack. Yet, this process is not a black box. We introduce a restricted \textbf{Self-Decoding} mechanism that allows the model to reconstruct its latent reasoning into human-readable text for supervision under specific guidance. Extensive experiments show that ALCA achieves robustness alignment, reducing the success rate of adaptive jailbreak attacks by over 40{\%} compared to strong baselines, while preserving performance. Our framework presents a path toward building LLMs that are both robustly secure and auditable."
}
Markdown (Informal)
[Resolving the Security-Auditability Dilemma with Auditable Latent Chain-of-Thought Alignment](https://preview.aclanthology.org/ingest-acl/2026.acl-long.1570/) (Wang et al., ACL 2026)
ACL