@comment{
  Conference paper entry: CCL 2025 proceedings, pages 972--984.
  NOTE(review): the fourth author is recorded as family name Minghui,
  given name Wu; the two parts may be swapped -- confirm against the
  published paper before relying on the rendered author list.
}
@inproceedings{liu-etal-2025-statistically,
  title     = {Statistically Optimized {SGNS} Model: Enhancing Word Vector Representation with Global Semantic Weight},
  author    = {Liu, Yulin and
               Feng, Xiong and
               Liu, Wanwei and
               Minghui, Wu},
  editor    = {Sun, Maosong and
               Duan, Peiyong and
               Liu, Zhiyuan and
               Xu, Ruifeng and
               Sun, Weiwei},
  booktitle = {Proceedings of the 24th {China} National Conference on Computational Linguistics ({CCL} 2025)},
  month     = aug,
  year      = {2025},
  address   = {Jinan, China},
  publisher = {Chinese Information Processing Society of China},
  url       = {https://preview.aclanthology.org/ingest-ccl/2025.ccl-1.74/},
  pages     = {972--984},
  abstract  = {``Addressing the limitations of the Skip-gram with Negative Sampling (SGNS) model related to negative sampling, subsampling, and its fixed context window mechanism, this paper first presents an in-depth statistical analysis of the optimal solution for SGNS matrix factorization, deriving the theoretically optimal distribution for negative sampling. Building upon this analysis, we propose the concept of Global Semantic Weight (GSW), derived from Pointwise Mutual Information (PMI). We integrate GSW with word frequency information to improve the effectiveness of both negative sampling and subsampling. Furthermore, we design dynamic adjustment mechanisms for the context window size and the number of negative samples based on GSW, enabling the model to adaptively capture contextual information commensurate with the semantic importance of the center word. Notably, our optimized model maintains the same time complexity as the original SGNS implementation. Experimental results demonstrate that our proposed model achieves competitive performance against state-of-the-art word embedding models including SGNS, CBOW, and GloVe, across multiple benchmark tasks. Compared with the current mainstream dynamic word vector models, this work emphasizes achieving a balance between efficiency and performance within a static embedding framework, and provides potential supplementation and support for complex models such as LLMs.''},
}
Markdown (Informal)
[Statistically Optimized SGNS Model: Enhancing Word Vector Representation with Global Semantic Weight](https://preview.aclanthology.org/ingest-ccl/2025.ccl-1.74/) (Liu et al., CCL 2025)
ACL