% NOTE(review): url previously pointed at the temporary Anthology build preview
% (preview.aclanthology.org/add-emnlp-2024-awards/...); replaced with the
% canonical, stable aclanthology.org URL. DOI kept bare (no resolver prefix).
@inproceedings{tian-etal-2024-detecting,
    title = {Detecting Machine-Generated Long-Form Content with Latent-Space Variables},
    author = {Tian, Yufei and
      Pan, Zeyu and
      Peng, Nanyun},
    editor = {Al-Onaizan, Yaser and
      Bansal, Mohit and
      Chen, Yun-Nung},
    booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2024},
    month = nov,
    year = {2024},
    address = {Miami, Florida, USA},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2024.findings-emnlp.608/},
    doi = {10.18653/v1/2024.findings-emnlp.608},
    pages = {10394--10408},
    abstract = {The increasing capability of large language models (LLMs) to generate fluent long-form texts is presenting new challenges in distinguishing these outputs from those of humans. Existing zero-shot detectors that primarily focus on token-level distributions are vulnerable to real-world domain shift including different decoding strategies, variations in prompts, and attacks. We propose a more robust method that incorporates abstract elements{---}such as topic or event transitions{---}as key deciding factors, by training a latent-space model on sequences of events or topics derived from human-written texts. On three different domains, machine generations which are originally inseparable from humans' on the token level can be better distinguished with our latent-space model, leading to a 31{\%} improvement over strong baselines such as DetectGPT. Our analysis further reveals that unlike humans, modern LLMs such as GPT-4 selecting event triggers and transitions differently, and inherent disparity regardless of the generation configurations adopted in real-time.}
}
Markdown (Informal)
[Detecting Machine-Generated Long-Form Content with Latent-Space Variables](https://aclanthology.org/2024.findings-emnlp.608/) (Tian et al., Findings of EMNLP 2024)
ACL