% Citation record for the conference paper "Where Paths Split" by Yuan, Zhang and Kasneci.
% NOTE(review): url targets a preview.aclanthology.org ingest path -- confirm the permanent URL before relying on it.
% NOTE(review): address looks like the meeting location rather than the publisher's city -- confirm which is intended.
@inproceedings{yuan-etal-2026-paths,
  title     = {Where Paths Split: Localized, Calibrated Control of Moral Reasoning in Large Language Models},
  author    = {Yuan, Chenchen and
               Zhang, Zheyu and
               Kasneci, Gjergji},
  editor    = {Liakata, Maria and
               Moreira, Viviane P. and
               Zhang, Jiajun and
               Jurgens, David},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-long.1933/},
  pages     = {41698--41721},
  isbn      = {979-8-89176-390-6},
  abstract  = {Large language models often display heterogeneous moral preferences across settings. We study inference-time steering toward a desired ethical framework while preserving general competence. We present Convergent-Divergent Routing, which traces and edits minimal branch points inside transformer blocks where ethical-framework-related pathways first converge and then diverge. Gating non-target branches at these loci blocks the downstream propagation while leaving upstream computations intact. We find that this intervention alone increases targeted ethical-framework reasoning. To achieve fine-grained control, we adapt Common Spatial Patterns to the residual stream and extract, for each branch-point layer, a pair of directions that discriminate between utilitarian and deontological frameworks. We then introduce Dual Logit Calibration, a closed-form, minimum-$\ell_2$-norm update that moves the residual within this two-dimensional subspace so the resulting directional projections align with user-specified preference weights. Experiments on real-life moral dilemmas show that our method reliably achieves preference calibration and largely preserves general capabilities, outperforming recent baselines while providing an interpretable mechanism.},
}
Markdown (Informal)
[Where Paths Split: Localized, Calibrated Control of Moral Reasoning in Large Language Models](https://preview.aclanthology.org/ingest-acl/2026.acl-long.1933/) (Yuan et al., ACL 2026)
ACL