% Wang and Riddell, LREC 2022: paper presenting the CCTAA corpus.
% The url field uses the canonical ACL Anthology address for record
% 2022.lrec-1.633 rather than a branch-specific preview host.
@inproceedings{wang-riddell-2022-cctaa,
  title     = {{CCTAA}: A Reproducible Corpus for {Chinese} Authorship Attribution Research},
  author    = {Wang, Haining and
               Riddell, Allen},
  editor    = {Calzolari, Nicoletta and
               B{\'e}chet, Fr{\'e}d{\'e}ric and
               Blache, Philippe and
               Choukri, Khalid and
               Cieri, Christopher and
               Declerck, Thierry and
               Goggi, Sara and
               Isahara, Hitoshi and
               Maegaard, Bente and
               Mariani, Joseph and
               Mazo, H{\'e}l{\`e}ne and
               Odijk, Jan and
               Piperidis, Stelios},
  booktitle = {Proceedings of the Thirteenth Language Resources and Evaluation Conference},
  month     = jun,
  year      = {2022},
  address   = {Marseille, France},
  publisher = {European Language Resources Association},
  url       = {https://aclanthology.org/2022.lrec-1.633/},
  pages     = {5889--5893},
  abstract  = {Authorship attribution infers the likely author of an unsigned, single-authored document from a pool of candidates. Despite recent advances, a lack of standard, reproducible testbeds for Chinese language documents impedes progress. In this paper, we present the Chinese Cross-Topic Authorship Attribution (CCTAA) corpus. It is the first standard testbed for authorship attribution on contemporary Chinese prose. The cross-topic design and relatively inflexible genre of newswire contribute to an appropriate level of difficulty. It supports reproducible research by using pre-defined data splits. We show that a sequence classifier based on pre-trained Chinese RoBERTa embedding and a support vector machine classifier using function character n-gram frequency features perform below expectations on this task. The code for generating the corpus and reproducing the baselines is freely available at \url{https://codeberg.org/haining/cctaa}.},
}
Markdown (Informal)
[CCTAA: A Reproducible Corpus for Chinese Authorship Attribution Research](https://aclanthology.org/2022.lrec-1.633/) (Wang & Riddell, LREC 2022)
ACL