% Conference paper (CoNLL 2026) by Aljaafari, Carvalho and Freitas.
% NOTE(review): `url` points at an ACL Anthology preview/ingestion host
% (preview.aclanthology.org/ingest-acl-workshops); confirm and switch to the
% canonical address once the volume is published.
@inproceedings{aljaafari-etal-2026-bridging,
  title     = {Bridging Linguistic Structure and Mechanistic Interpretability for Conceptual Interpretation in Language Models},
  author    = {Aljaafari, Nura and
               Carvalho, Danilo and
               Freitas, Andre},
  editor    = {Bonial, Claire and
               Berzak, Yevgeni},
  booktitle = {Proceedings of the 30th Conference on Computational Natural Language Learning},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.conll-main.44/},
  pages     = {722--741},
  isbn      = {979-8-89176-410-1},
  abstract  = {Understanding how language models compose meaning from linguistic input remains a central problem in interpretability research. Mechanistic studies have attributed functional roles to core transformer components; however, these findings derive largely from factual retrieval settings. Whether the same mechanisms support \textit{conceptual interpretation}, the compositional mapping from definitional expressions to abstract meaning, remains insufficiently characterised. We introduce \textit{DSRA} (Definitional Semantic Role Analysis), a methodology that applies causal tracing within the reverse dictionary task and augments restoration traces with definitional semantic roles (DSRs) grounded in Argument Structure Theory. This linguistic overlay identifies which compositional functions (e.g., genus, differentia quality) are associated with high-recovery states, extending activation patching beyond token-level localisation. Applied to GPT-J-6B (English) and BERTIN GPT-J-6B (Spanish), the results show that MLP layers associate content-bearing tokens with high-specificity DSR categories in early layers, MHA layers distribute integration across middle-to-upper layers with concentration at the final token, and hidden states aggregate information in upper layers. Alignment between restored states and DSR categories indicates systematic correspondence between internal activations and definitional structure, with consistent localisation patterns across both languages.},
}

Markdown (Informal)
[Bridging Linguistic Structure and Mechanistic Interpretability for Conceptual Interpretation in Language Models](https://preview.aclanthology.org/ingest-acl-workshops/2026.conll-main.44/) (Aljaafari et al., CoNLL 2026)
ACL