@comment{
  duszenko-etal-2026-sycophantic: conference paper entry, ACL 2026 proceedings.
  NOTE(review): url points at the preview.aclanthology.org ingest host; presumably it
  should be swapped for the canonical Anthology URL or a doi once available -- confirm.
  NOTE(review): the url slug says acl-srw while booktitle names the main proceedings;
  confirm the exact volume title.
  NOTE(review): author names look ASCII-folded; verify spelling against the paper.
  The abstract is kept for reference only; ranges and parenthetical dashes in it use
  the LaTeX double and triple hyphen forms.
}
@inproceedings{duszenko-etal-2026-sycophantic,
  title     = {Sycophantic Anchors: Localizing and Quantifying User Agreement in Reasoning Models},
  author    = {Duszenko, Jacek and
               Kazienko, Przemyslaw and
               Kocon, Jan},
  editor    = {T.Y.S.S., Santosh and
               Rodriguez, Juan Diego and
               de Gibert, Ona},
  booktitle = {Proceedings of the 64th Annual Meeting of the {Association} for {Computational} {Linguistics} ({ACL} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-srw.20/},
  pages     = {225--239},
  isbn      = {979-8-89176-393-7},
  abstract  = {Reasoning models frequently agree with incorrect user suggestions --- a behavior known as sycophancy. However, it is unclear where in the reasoning trace this agreement originates and how strong the commitment is. We introduce sycophantic anchors --- sentences identified via counterfactual analysis that commit models to user agreement. Across four reasoning models spanning three architecture families (Llama, Qwen, Falcon-hybrid) and 1.5B--8B parameters, we analyze over 200,000 counterfactual rollouts and show that linear probes reliably detect sycophantic anchors (74--85{\%} balanced accuracy), outperforming text-only baselines at high commitment levels --- confirming they capture internal states beyond surface vocabulary. Regressors further predict commitment strength from activations ($R^2$ up to 0.74). We observe a consistent asymmetry: sycophancy leaves a stronger mechanistic footprint than correct reasoning. We also find that sycophancy builds gradually during generation rather than being determined by the prompt. These findings enable sentence-level detection and quantification of model misalignment mid-inference.},
}

Markdown (Informal)
[Sycophantic Anchors: Localizing and Quantifying User Agreement in Reasoning Models](https://preview.aclanthology.org/ingest-acl/2026.acl-srw.20/) (Duszenko et al., ACL 2026)
ACL