@inproceedings{zhou-etal-2025-length,
title = "Length-Induced Embedding Collapse in {PLM}-based Models",
author = "Zhou, Yuqi and
Dai, Sunhao and
Cao, Zhanshuo and
Zhang, Xiao and
Xu, Jun",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.1396/",
pages = "28767--28791",
ISBN = "979-8-89176-251-0",
    abstract = "Text embeddings from PLM-based models enable a wide range of applications, yet their performance often degrades on longer texts. In this paper, we introduce a phenomenon we call \textbf{Length Collapse}, where embeddings of longer texts tend to cluster together. This clustering results in a distributional inconsistency between the embeddings of short and long texts. We further investigate how these differences contribute to the performance decline observed with longer texts across various downstream tasks. Through a rigorous theoretical analysis of the self-attention mechanism, which acts as a low-pass filter in PLM-based models, we demonstrate that as text length increases, the strength of low-pass filtering intensifies, causing embeddings to retain more low-frequency components. As a result, input token features become more similar, leading to clustering and ultimately the collapse of embeddings for longer texts. To address this issue, we propose a simple method, TempScale, which mitigates the Length Collapse phenomenon. By narrowing the gap in low-pass filtering rates between long and short texts, TempScale ensures more consistent embeddings across different text lengths. This approach leads to performance improvements of \textbf{0.94{\%}} on MTEB and \textbf{1.10{\%}} on LongEmbed, which focuses specifically on long-context retrieval, providing strong evidence for the validity of our analysis. The source code is available at \url{https://github.com/Yuqi-Zhou/Length_Collapse}."
}
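
The abstract describes TempScale as a rescaling of the self-attention temperature that narrows the gap in low-pass filtering rates between long and short texts. The sketch below is only an illustration of that idea applied to standard scaled dot-product attention; the `length_aware_temperature` schedule and its parameters are assumptions for demonstration, not the formula from the paper (see the linked repository for the authors' implementation).

```python
import math
import torch
import torch.nn.functional as F

def scaled_attention(q, k, v, temperature=1.0):
    """Scaled dot-product attention with an extra temperature term.

    A temperature below 1 sharpens the softmax, which weakens the low-pass
    filtering effect the paper attributes to self-attention on long inputs.
    """
    d = q.size(-1)
    logits = q @ k.transpose(-2, -1) / (math.sqrt(d) * temperature)
    return F.softmax(logits, dim=-1) @ v

def length_aware_temperature(seq_len, base_len=128, alpha=0.1):
    """Hypothetical schedule: use temperature 1.0 for short inputs and a
    progressively smaller temperature as sequence length grows."""
    if seq_len <= base_len:
        return 1.0
    return 1.0 / (1.0 + alpha * math.log(seq_len / base_len))

# Usage: sharpen attention for a 512-token input.
q = k = v = torch.randn(1, 512, 64)  # (batch, seq_len, head_dim)
tau = length_aware_temperature(q.size(1))
out = scaled_attention(q, k, v, temperature=tau)
```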
Markdown (Informal)
[Length-Induced Embedding Collapse in PLM-based Models](https://preview.aclanthology.org/ingestion-acl-25/2025.acl-long.1396/) (Zhou et al., ACL 2025)
ACL
- Yuqi Zhou, Sunhao Dai, Zhanshuo Cao, Xiao Zhang, and Jun Xu. 2025. Length-Induced Embedding Collapse in PLM-based Models. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 28767–28791, Vienna, Austria. Association for Computational Linguistics.