% Conference paper record (ACL Anthology export) for "The Million Authors Corpus".
% - url uses the canonical Anthology permalink rather than the temporary
%   ingestion-preview host the record was originally captured from.
% - address is kept as exported; presumably it is the conference venue (the
%   Anthology convention), not the publisher's city -- confirm if a style needs the latter.
% - The dataset acronym (MAC) in the abstract was restored where the export had
%   dropped it; NOTE(review): verify against the published abstract.
@inproceedings{israeli-etal-2025-million,
  title     = {The {Million} {Authors} {Corpus}: A Cross-Lingual and Cross-Domain {Wikipedia} Dataset for Authorship Verification},
  author    = {Israeli, Abraham and
               Liu, Shuai and
               May, Jonathan and
               Jurgens, David},
  editor    = {Che, Wanxiang and
               Nabende, Joyce and
               Shutova, Ekaterina and
               Pilehvar, Mohammad Taher},
  booktitle = {Findings of the Association for Computational Linguistics: {ACL} 2025},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.findings-acl.1335/},
  doi       = {10.18653/v1/2025.findings-acl.1335},
  pages     = {25997--26017},
  isbn      = {979-8-89176-256-5},
  abstract  = {Authorship verification (AV) is a crucial task for applications like identity verification, plagiarism detection, and AI-generated text identification. However, datasets for training and evaluating AV models are primarily in English and primarily in a single domain. This precludes analysis of AV techniques for generalizability and can cause seemingly valid AV solutions to, in fact, rely on topic-based features rather than actual authorship features. To address this limitation, we introduce the Million Authors Corpus (MAC), a novel dataset encompassing contributions from dozens of languages on Wikipedia. It includes only long and contiguous textual chunks taken from Wikipedia edits and links those texts to their authors. MAC includes 60.08M textual chunks, contributed by 1.29M Wikipedia authors. It enables broad-scale cross-lingual and cross-domain AV evaluation to ensure accurate analysis of model capabilities that are not overly optimistic. We provide baseline evaluations using state-of-the-art AV models as well as information retrieval models that are not AV-specific in order to demonstrate MAC's unique cross-lingual and cross-domain ablation capabilities.},
}
Markdown (Informal)
[The Million Authors Corpus: A Cross-Lingual and Cross-Domain Wikipedia Dataset for Authorship Verification](https://aclanthology.org/2025.findings-acl.1335/) (Israeli et al., Findings 2025)
ACL