@inproceedings{duderstadt-2025-wikivecs,
title = "Wikivecs: A Fully Reproducible Vectorization of Multilingual {W}ikipedia",
author = "Duderstadt, Brandon",
editor = "Arora, Akhil and
Johnson, Isaac and
Kaffee, Lucie-Aim{\'e}e and
Kuo, Tzu-Sheng and
Piccardi, Tiziano and
Sen, Indira",
booktitle = "Proceedings of the 2nd Workshop on Advancing Natural Language Processing for Wikipedia (WikiNLP 2025)",
month = aug,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/acl25-workshop-ingestion/2025.wikinlp-1.1/",
pages = "1--9",
ISBN = "979-8-89176-284-8",
    abstract = "Dense vector representations have become foundational to modern natural language processing (NLP), powering diverse workflows from semantic search and retrieval augmented generation to content comparison across languages. Although Wikipedia is one of the most comprehensive and widely used datasets in modern NLP research, it lacks a fully reproducible and permissively licensed dense vectorization. In this paper, we present Wikivecs, a fully reproducible, permissively licensed dataset containing dense vector embeddings for every article in Multilingual Wikipedia. Our pipeline leverages a fully reproducible and permissively licensed multilingual text encoder to embed Wikipedia articles into a unified vector space, making it easy to compare and analyze content across languages. Alongside these vectors, we release a two-dimensional data map derived from the vectors, enabling visualization and exploration of Multilingual Wikipedia{'}s content landscape. We demonstrate the utility of our dataset by identifying several content gaps between English and Russian Wikipedia."
}
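The abstract describes embedding articles from different language editions into a single vector space so they can be compared directly. The following is a minimal sketch of that kind of cross-lingual comparison, not the paper's actual pipeline: the encoder choice (paraphrase-multilingual-MiniLM-L12-v2 via the sentence-transformers library) and the toy article snippets are illustrative assumptions.

```python
# Hypothetical sketch (not the Wikivecs pipeline): embed an English and a
# Russian snippet with an off-the-shelf multilingual sentence encoder and
# compare them in the shared vector space via cosine similarity.
from sentence_transformers import SentenceTransformer, util

texts = [
    "The cat is a small domesticated carnivorous mammal.",   # English
    "Кошка — небольшое одомашненное хищное млекопитающее.",  # Russian
]

# Assumed encoder; the paper only specifies a fully reproducible,
# permissively licensed multilingual text encoder, not this model.
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
vectors = model.encode(texts, normalize_embeddings=True)

# High similarity suggests the two texts cover the same topic; low similarity
# across aligned article pairs is one signal of a cross-lingual content gap.
print(float(util.cos_sim(vectors[0], vectors[1])))
```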