% Workshop paper in the TrustNLP 2026 proceedings (ACL Anthology export format).
% NOTE(review): the url field points at an Anthology preview/ingest branch, which
% is presumably temporary; swap in the canonical address once it is confirmed.
@inproceedings{yeh-etal-2026-lexical,
    title = "Lexical Familiarity Predicts Processing Depth for Nonliteral Language in Large Language Models",
    author = "Yeh, Lang-Ching and
      Wang, Yu-Chieh and
      Hsieh, Shu-Kai",
    editor = "Chang, Kai-Wei and
      Mehrabi, Ninareh and
      Krishna, Satyapriya and
      Das, Anubrata and
      Dhamala, Jwala and
      Cao, Yang Trista and
      Kumarage, Tharindu and
      Ramakrishna, Anil and
      Christodoulopoulos, Christos and
      Wan, Yixin and
      Galstyan, Aram and
      Kumar, Anoop and
      Gupta, Rahul",
    booktitle = "Proceedings of the 6th Workshop on Trustworthy {NLP} ({T}rust{NLP} 2026)",
    month = jul,
    year = "2026",
    address = "San Diego, California",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/ingest-acl-workshops/2026.trustnlp-main.32/",
    pages = "456--470",
    ISBN = "979-8-89176-418-7",
    abstract = "This paper investigates how large language models internally process nonliteral language. Analyzing five categories spanning slang, metaphor, and idioms across all 48 layers of Gemma-3-12B-IT with Gemma Scope 2 sparse autoencoders, we find a lexical familiarity gradient: processing depth depends on available prior lexical knowledge, not figurative type. Idioms diverge at L1 as entrenched units; expressions built from familiar words (metaphors, semantic-shift and constructional slang) converge at L7{--}9; neologisms peak at L41, activating 3{\texttimes} more unique features. Paraphrase residual analysis confirms strong signals only at the gradient endpoints, yielding a three-tier hierarchy of entrenched retrieval, known-word reanalysis, and novel-word construction. Crucially, this peak-layer structure replicates in base models (Gemma-PT, Qwen-Base), demonstrating that the gradient is a robust property of pretrained representations rather than an alignment artifact. We additionally identify an activation density confound in SAE feature counts that produces spurious cross-condition convergence. Overall, processing depth is better predicted by lexical familiarity than by figurative type, with implications for robustness to non-standard language and for SAE-based interpretability."
}