@comment{Schmalz and Tack, BEA 2025 workshop paper (ACL Anthology ID 2025.bea-1.71).
  The url field uses the canonical Anthology address built from that ID; the
  exported entry pointed at a temporary preview/ingestion branch.
  NOTE(review): confirm the canonical page resolves before relying on it.}
@inproceedings{schmalz-tack-2025-gptzeros,
  title     = {Can {GPTZero}'s {AI} Vocabulary Distinguish Between {LLM}-Generated and Student-Written Essays?},
  author    = {Schmalz, Veronica and
               Tack, Ana{\"i}s},
  editor    = {Kochmar, Ekaterina and
               Alhafni, Bashar and
               Bexte, Marie and
               Burstein, Jill and
               Horbach, Andrea and
               Laarmann-Quante, Ronja and
               Tack, Ana{\"i}s and
               Yaneva, Victoria and
               Yuan, Zheng},
  booktitle = {Proceedings of the 20th Workshop on Innovative Use of {NLP} for Building Educational Applications ({BEA} 2025)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.bea-1.71/},
  pages     = {937--952},
  isbn      = {979-8-89176-270-1},
  abstract  = {Despite recent advances in AI detection methods, their practical application, especially in education, remains limited. Educators need functional tools pointing to AI indicators within texts, rather than merely estimating whether AI was used. GPTZero's new AI Vocabulary feature, which highlights parts of a text likely to be AI-generated based on frequent words and phrases from LLM-generated texts, offers a potential solution. However, its effectiveness has not yet been empirically validated. In this study, we examine whether GPTZero's AI Vocabulary can effectively distinguish between LLM-generated and student-written essays. We analyze the AI Vocabulary lists published from October 2024 to March 2025 and evaluate them on a subset of the Ghostbuster dataset, which includes student and LLM essays. We train multiple Bag-of-Words classifiers using GPTZero's AI Vocabulary terms as features and examine their individual contributions to classification. Our findings show that simply checking for the presence, not the frequency, of specific AI terms yields the best results, particularly with ChatGPT-generated essays. However, performance drops to near-random when applied to Claude-generated essays, indicating that GPTZero's AI Vocabulary may not generalize well to texts generated by LLMs other than ChatGPT. Additionally, all classifiers based on GPTZero's AI Vocabulary significantly underperform compared to Bag-of-Words classifiers trained directly on the full dataset vocabulary. These findings suggest that fixed vocabularies based solely on lexical features, despite their interpretability, have limited effectiveness across different LLMs and educational writing contexts.},
}
Markdown (Informal)
[Can GPTZero’s AI Vocabulary Distinguish Between LLM-Generated and Student-Written Essays?](https://aclanthology.org/2025.bea-1.71/) (Schmalz & Tack, BEA 2025)
ACL