% Journal article in Transactions of the Association for Computational
% Linguistics, volume 14 (2026), pages 271--291.
% NOTE(review): the url field points at a preview.aclanthology.org ingest
% path; confirm whether a canonical Anthology link should replace it. The
% doi field is kept as the stable identifier.
@article{waldis-etal-2026-aligned,
  author    = {Waldis, Andreas and Gautam, Vagrant and Lauscher, Anne and Klakow, Dietrich and Gurevych, Iryna},
  title     = {Aligned Probing: Relating Toxic Behavior and Model Internals},
  journal   = {Transactions of the Association for Computational Linguistics},
  volume    = {14},
  pages     = {271--291},
  year      = {2026},
  publisher = {MIT Press},
  address   = {Cambridge, MA},
  doi       = {10.1162/tacl.a.613},
  url       = {https://preview.aclanthology.org/ingest-latest-mitpress-cl-tacl/2026.tacl-1.14/},
}