@inproceedings{gu-etal-2026-beyond,
title = "Beyond Neural Incompatibility: Cross-Scale Knowledge Transfer in Large Language Models through Latent Semantic Alignment",
author = "Gu, Jian and
Aleti, Aldeida and
Chen, Chunyang and
Zhang, Hongyu",
editor = "Liakata, Maria and
Moreira, Viviane P. and
Zhang, Jiajun and
Jurgens, David",
booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {ACL} 2026",
month = jul,
year = "2026",
address = "San Diego, California, United States",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1101/",
pages = "21893--21905",
ISBN = "979-8-89176-395-1",
abstract = "Large Language Models (LLMs) encode substantial knowledge in their parameters, which can be located, traced, and analyzed. Despite recent progress in neural interpretability, it is still unclear how to transfer such knowledge in a fine-grained manner, namely parametric knowledge transfer (PKT). A central challenge is to make cross-scale transfer effective and efficient when source and target models differ in architecture and parameterization. Existing methods that directly reuse layer parameters are therefore strongly limited by neural incompatibility. In this paper, we identify latent semantic alignment as the key prerequisite for cross-scale knowledge transfer. Instead of directly moving layer parameters, our approach uses activations as the transfer medium. SemAlign has two stages: an \textit{layer attribution} stage that attributes task-relevant source layers and selects exactly one source layer for each target layer, and a \textit{semantic alignment} stage that pairs them from shallow to deep and optimizes the target with source-side supervisory hidden states. The alignment is carried out in latent space. In the current realization, training follows a shallow-to-deep frontier schedule: at each stage, only the current target layer is trainable, the layer objective is a Fisher-weighted quadratic surrogate on target-space aligned logits, and the final output layer keeps KL distillation. The transferred object nonetheless remains the aligned representation itself. Evaluations on four benchmarks demonstrate the efficacy of our method. Further analysis reveals the key factors that ease cross-scale knowledge transfer and provides insights into the nature of latent semantic alignment."
}Markdown (Informal)
[Beyond Neural Incompatibility: Cross-Scale Knowledge Transfer in Large Language Models through Latent Semantic Alignment](https://preview.aclanthology.org/ingest-acl/2026.findings-acl.1101/) (Gu et al., Findings 2026)
ACL