% Workshop paper in the GEM 2026 proceedings (ACL Anthology style export).
% NOTE(review): the url field points at a preview.aclanthology.org ingest build;
% confirm and switch to the canonical Anthology URL once the volume is published.
@inproceedings{kumar-2026-pressure,
    title = "Pressure-Testing Deception Probes in {LLM}s: Scaling, Robustness, and the Geometry of Deceptive Representations",
    author = "Kumar, Sachin",
    editor = "Mille, Simon and
      Gehrmann, Sebastian and
      Schmidtov{\'a}, Patr{\'i}cia and
      Du{\v{s}}ek, Ond{\v{r}}ej and
      Fadaee, Marzieh and
      Lo, Kyle and
      Santus, Enrico and
      Stanovsky, Gabriel",
    booktitle = "Proceedings of the Fifth Workshop on Generation, Evaluation and Metrics ({GEM})",
    month = jul,
    year = "2026",
    address = "San Diego, California, USA",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/ingest-acl-workshops/2026.gem-main.43/",
    pages = "472--489",
    isbn = "979-8-89176-423-1",
    abstract = "Linear probes trained on internal activations of Large Language Models (LLMs) are increasingly proposed as evaluation metrics for deceptive generation, automated monitors that score whether a model{'}s output was produced deceptively, without requiring ground-truth labels or human annotation. Yet these metrics report AUROC scores exceeding 0.96 on clean benchmarks while demonstrating profound fragility under distributional shift. This paper presents a systematic pressure-test of such probe-based evaluation metrics across the Gemma 3 model family (1B{--}27B parameters), diagnosing why they fail rather than merely documenting that they fail. We investigate four competing hypotheses about how deception is encoded: as (1) a single linear direction, (2) a multi-dimensional subspace, (3) a convex conic hull, or (4) a proxy for computational entropy. Our experimental design includes cross-domain transfer matrices, multi-dimensional probe analysis with permutation null baselines, entropy-residualization tests, and systematic distractor evaluations across 8 stylistic shifts. Across all four model scales, we find that: (a) probe-based metrics achieve near-perfect AUROC ($\ge$0.998) on clean data but collapse under stylistic shifts when trained without stylistic augmentation, whereas style-augmented probes recover near-perfect detection (mean AUROC{~}0.979{--}0.983) even on unseen styles; (b) the single-direction hypothesis is decisively rejected (k=1 captures only 0.61{--}0.80 AUROC of the signal), with cross-domain transfer failure confirmed as geometric rather than layer-mismatch-driven; (c) the entropy-proxy hypothesis is rejected (maximum $|\rho|=0.454$, maximum $\Delta$AUROC after residualization=0.004); and (d) deception does not form a statistically significant linear subspace even within individual domains (per-domain $k^*{=}0$), yet multi-dimensional probes (k$\ge$5) consistently recover the signal through distributed sub-threshold features.
These findings demonstrate that probe fragility under standard training reflects distributional narrowness rather than a fundamental architectural limitation: style-augmented probes recover near-perfect detection (mean AUROC 0.979{--}0.983 on unseen styles) at both the 4B and 27B scales, establishing that the inverse scaling pattern observed under standard training is a training-distribution artifact rather than a genuine scale-dependent phenomenon."
}
Markdown (Informal)
[Pressure-Testing Deception Probes in LLMs: Scaling, Robustness, and the Geometry of Deceptive Representations](https://preview.aclanthology.org/ingest-acl-workshops/2026.gem-main.43/) (Kumar, GEM 2026)
ACL