% Conference-paper record: Ayadi and Hautli-Janisz (2026), Probing Bias
% Formation in Medical LLMs through Activation Steering.
% NOTE(review): the url field points at a preview.aclanthology.org ingest
% path; presumably it should be replaced by the canonical Anthology URL once
% that exists -- confirm before publishing.
% NOTE(review): address looks like the conference location rather than the
% publisher's city -- confirm this matches the conventions of the target style.
@inproceedings{ayadi-hautli-janisz-2026-probing,
  title     = {Probing Bias Formation in Medical {LLM}s through Activation Steering},
  author    = {Ayadi, Bayram and
               Hautli-Janisz, Annette},
  editor    = {T.Y.S.S., Santosh and
               Rodriguez, Juan Diego and
               de Gibert, Ona},
  booktitle = {Proceedings of the 64th Annual Meeting of the {A}ssociation for {C}omputational {L}inguistics ({ACL} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl/2026.acl-srw.54/},
  pages     = {609--620},
  isbn      = {979-8-89176-393-7},
  abstract  = {Large Language Models specialized for the medical domain achieve high performance on static benchmarks, but remain vulnerable to sycophantic confabulation, where the models generate medically spurious rationales to justify incorrect user hints. This robustness gap poses severe risks in clinical environments, as models may prioritize contextual faithfulness to a biased prompt over their internal parametric medical knowledge. This study introduces a mechanistic approach to identify and mitigate these failures in MedGemma-27B, isolating hint integration circuits using Sparse Autoencoders and geometric manifold analysis. Our findings reveal that sycophantic bias is a highly distributed and polymorphic concept, with biased reasoning routed through shifting dimensions across transformer layers. We identify the optimal layer for intervention and demonstrate that cluster-conditioned dynamic steering tailored to the geometric subspace of the prompt outperforms static global interventions, though it reveals a fundamental tension between bias resilience and the retention of internal parametric knowledge. This work proposes a principled framework toward clinical AI systems that are more robust and aligned with expert medical logic, demonstrating the potential of cluster-conditioned geometric interventions while characterizing the inherent trade-offs in clinical knowledge retention.},
}
Markdown (Informal)
[Probing Bias Formation in Medical LLMs through Activation Steering](https://preview.aclanthology.org/ingest-acl/2026.acl-srw.54/) (Ayadi & Hautli-Janisz, ACL 2026)
ACL