@comment{
  Workshop paper record for "What Does Alignment Cost?" (KnowFM 2026).
  - The subtitle word "The" is brace-protected in the title because it follows
    a question mark, not a colon, so sentence-casing styles would otherwise
    lowercase it.
  - NOTE(review): the url field points at a preview.aclanthology.org ingest
    path, presumably a staging address; confirm the canonical URL before
    relying on it.
  - NOTE(review): the address field holds the location given in the export,
    which looks like the event venue rather than the publisher city; verify
    against the target style.
}
@inproceedings{hao-etal-2026-alignment,
  title     = {What Does Alignment Cost? {The} Structural Brittleness of Chain-of-Thought Reasoning},
  author    = {Hao, Joanna and
               Jiang, Shanduojiao and
               Nakka, Sai Asish},
  editor    = {Chen, Canyu and
               Zhang, Yuji and
               Li, Zoey Sha and
               Wang, Zihan and
               Wang, Qineng and
               Su, Jinyan and
               Kargupta, Priyanka and
               Marjanovi{\'c}, Sara Vera and
               Pan, Jeff Z. and
               Bansal, Mohit and
               Augenstein, Isabelle and
               Han, Jiawei and
               Ji, Heng and
               Li, Manling},
  booktitle = {Proceedings of the 4th Workshop on Towards Knowledgeable Foundation Models ({K}now{FM} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.knowfm-1.3/},
  pages     = {25--33},
  isbn      = {979-8-89176-403-3},
  abstract  = {While Chain-of-Thought (CoT) prompting enables Large Language Models to explicitly justify their predictions, the extent to which these textual rationales faithfully reflect internal computation remains unclear. We investigate the circuit-level impact of alignment by performing a strict within-family comparison of the 1B-parameter Llama 3 architecture (Base vs. Instruct). Executing dynamic circuit discovery and dual-direction resample ablation on unconstrained CoT traces across synthetic mathematical primitives and a GSM8K proxy, we find that foundation models possess highly redundant, self-repairing computational networks; completely corrupting their primary reasoning circuits yields a minimal performance drop (2.92{\%}) due to the dynamic compensation of backup heads (the Hydra Effect). In contrast, the instruction-tuned model exhibits reduced structural redundancy, suffering more than double the degradation (6.79{\%}) under identical perturbation. We formalize our observation as an ``Alignment Tax on Redundancy'': optimizing for human-preference compliance repurposes dormant backup circuits, centralizing mathematical routing and rendering the aligned model{'}s reasoning pathways significantly more vulnerable to internal perturbation.},
}
Markdown (Informal)
[What Does Alignment Cost? The Structural Brittleness of Chain-of-Thought Reasoning](https://preview.aclanthology.org/ingest-acl-workshops/2026.knowfm-1.3/) (Hao et al., KnowFM 2026)
ACL