% Workshop paper introducing the WikiLingDiv dataset (LaTeCH-CLfL 2026 proceedings).
% Case-sensitive words in the title are protected with whole-word braces.
% NOTE(review): the url field points at a preview.aclanthology.org "ingest" path;
%   confirm the permanent Anthology URL (or add a doi) before relying on it.
% NOTE(review): address presumably records the workshop venue rather than the
%   publisher's city; confirm this matches the convention of the target style.
@inproceedings{essfors-baumann-2026-wikilingdiv-dataset,
  author    = {Essfors, Hannes and
               Baumann, Andreas},
  title     = {{WikiLingDiv}: a dataset for quantifying digital linguistic diversity using {Wikipedia} page views},
  editor    = {Alves, Diego and
               Bizzoni, Yuri and
               Degaetano-Ortlieb, Stefania and
               Kazantseva, Anna and
               Pagel, Janis and
               Szpakowicz, Stan},
  booktitle = {Proceedings of the 10th Joint {SIGHUM} Workshop on Computational Linguistics for Cultural Heritage, Social Sciences, Humanities and Literature 2026},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  pages     = {202--211},
  isbn      = {979-8-89176-373-9},
  url       = {https://preview.aclanthology.org/ingest-eacl/2026.latechclfl-1.19/},
  abstract  = {With the conflation of digital and non-digital spaces, and NLP technologies being integrated into an increasing number of aspects of daily life, linguistic diversity cannot be fully understood without considering how language is used online. While existing models of linguistic diversity typically have relied on speaker numbers or language production, the dimension of diversity in language consumption remains comparatively understudied. To facilitate such research, we introduce WikiLingDiv, an openly accessible dataset for quantifying linguistic diversity in online knowledge retrieval using Wikipedia page views. Our dataset is based on yearly page views of 340 language editions of Wikipedia, aggregated across 239 countries and territories over 10 years (2015-2024). Using the dataset, we illustrate spatial and temporal patterns of digital linguistic diversity, suggesting that diversity has both increased and decreased across countries and regions, while highlighting country-specific dynamics in language usage. We release the dataset as an openly available and easily integrable data resource for researchers in computational linguistics, digital humanities, and the broader social sciences, enabling further work on linguistic variation, digital inequality, and the interaction between language use and digital technology.},
}

Markdown (Informal)
[WikiLingDiv: a dataset for quantifying digital linguistic diversity using Wikipedia page views](https://preview.aclanthology.org/ingest-eacl/2026.latechclfl-1.19/) (Essfors & Baumann, LaTeCH-CLfL 2026)
ACL