@inproceedings{uchendu-etal-2021-turingbench-benchmark,
    title = "{TURINGBENCH}: A Benchmark Environment for {T}uring Test in the Age of Neural Text Generation",
    author = "Uchendu, Adaku and
      Ma, Zeyu and
      Le, Thai and
      Zhang, Rui and
      Lee, Dongwon",
    editor = "Moens, Marie-Francine and
      Huang, Xuanjing and
      Specia, Lucia and
      Yih, Scott Wen-tau",
    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2021",
    month = nov,
    year = "2021",
    address = "Punta Cana, Dominican Republic",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2021.findings-emnlp.172/",
    doi = "10.18653/v1/2021.findings-emnlp.172",
    pages = "2001--2016",
    abstract = "Recent progress in generative language models has enabled machines to generate astonishingly realistic texts. While there are many legitimate applications of such models, there is also a rising need to distinguish machine-generated texts from human-written ones (e.g., fake news detection). However, to our best knowledge, there is currently no benchmark environment with datasets and tasks to systematically study the so-called {\textquotedblleft}Turing Test{\textquotedblright} problem for neural text generation methods. In this work, we present the TURINGBENCH benchmark environment, which is comprised of (1) a dataset with 200K human- or machine-generated samples across 20 labels Human, GPT-1, GPT-2{\_}small, GPT-2{\_}medium, GPT-2{\_}large, GPT-2{\_}xl, GPT-2{\_}PyTorch, GPT-3, GROVER{\_}base, GROVER{\_}large, GROVER{\_}mega, CTRL, XLM, XLNET{\_}base, XLNET{\_}large, FAIR{\_}wmt19, FAIR{\_}wmt20, TRANSFORMER{\_}XL, PPLM{\_}distil, PPLM{\_}gpt2, (2) two benchmark tasks{--}i.e., Turing Test (TT) and Authorship Attribution (AA), and (3) a website with leaderboards. Our preliminary experimental results using TURINGBENCH show that GPT-3 and FAIR{\_}wmt20 are the current winners, among all language models tested, in generating the most human-like indistinguishable texts with the lowest F1 score by five state-of-the-art TT detection models. The TURINGBENCH is available at: \url{https://turingbench.ist.psu.edu/}"
}