@inproceedings{jha-reagen-2025-spectral,
title = "Spectral Scaling Laws in Language Models: emph{How Effectively Do Feed-Forward Networks Use Their Latent Space?}",
author = "Jha, Nandan Kumar and
Reagen, Brandon",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Proceedings of the 2025 Conference on Empirical Methods in Natural Language Processing",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.emnlp-main.1776/",
doi = "10.18653/v1/2025.emnlp-main.1776",
pages = "35047--35058",
ISBN = "979-8-89176-332-6",
abstract = "As Large Language Models (LLMs) scale, the question is not just how large they become, but \textit{how much of their capacity is effectively utilized}. Existing scaling laws relate model size to loss, yet overlook how components exploit their latent space. In this work, we focus on Feed-Forward Networks (FFNs) and recast width selection as a spectral utilization optimization problem. Using a lightweight diagnostic suite: Hard Rank (participation ratio), Soft Rank (Shannon Rank), Spectral Concentration, and the composite Spectral Utilization Index (SUI), we quantify how many latent directions are meaningfully activated across LLaMA, GPT-2, and nGPT families. Our \textit{key finding} is an \textbf{Asymmetric Spectral Scaling Law}: soft rank follows an almost perfect power law with FFN width, while hard rank grows only sublinearly, with high variance. This asymmetry suggests that widening FFNs mostly adds low-energy tail directions, while dominant-mode subspaces saturate early. Moreover, at larger widths, variance further collapses into a narrow subspace, leaving much of the latent space under-utilized. These results recast FFN width selection as a principled trade-off between tail capacity and dominant-mode capacity, offering concrete guidance for inference-efficient LLM design."
}
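The abstract names a diagnostic suite built from the spectrum of FFN activations (Hard Rank as a participation ratio, Soft Rank as a Shannon/effective rank, and Spectral Concentration). The sketch below is not the authors' code; it is a minimal illustration using the standard textbook definitions of those quantities, with an illustrative function name `spectral_diagnostics`, an assumed `(n_tokens, ffn_width)` activation matrix, and an arbitrary top-k choice for the concentration measure.

```python
# Minimal sketch of standard spectral-utilization diagnostics.
# Not the paper's implementation; definitions are the common ones:
#   hard rank  = participation ratio of the squared singular-value spectrum
#   soft rank  = exp(Shannon entropy) of the normalized spectrum
import numpy as np

def spectral_diagnostics(activations: np.ndarray) -> dict:
    """activations: assumed shape (n_tokens, ffn_width) of FFN outputs."""
    centered = activations - activations.mean(axis=0, keepdims=True)
    s = np.linalg.svd(centered, compute_uv=False)   # descending singular values
    energy = s ** 2                                  # variance per latent direction
    p = energy / energy.sum()                        # normalized spectrum

    hard_rank = energy.sum() ** 2 / (energy ** 2).sum()        # participation ratio
    soft_rank = np.exp(-(p * np.log(p + 1e-12)).sum())         # Shannon (effective) rank
    # Spectral concentration: variance captured by the top-k directions;
    # k = 1% of the width here purely for illustration.
    k = max(1, activations.shape[1] // 100)
    concentration = energy[:k].sum() / energy.sum()
    return {"hard_rank": hard_rank,
            "soft_rank": soft_rank,
            "spectral_concentration": concentration}
```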