@inproceedings{jeon-etal-2026-unsupervised,
    title = "Unsupervised Detection of {LLM}-Generated Text in {K}orean Using Syntactic and Semantic Cues",
    author = "Jeon, Heejeong and
      Park, MinSu and
      Choi, YunSeok and
      Park, Eunil",
    editor = "Demberg, Vera and
      Inui, Kentaro and
      M{\`a}rquez, Llu{\'\i}s",
    booktitle = "Findings of the {A}ssociation for {C}omputational {L}inguistics: {EACL} 2026",
    month = mar,
    year = "2026",
    address = "Rabat, Morocco",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/ingest-eacl/2026.findings-eacl.77/",
    pages = "1504--1518",
    isbn = "979-8-89176-386-9",
    abstract = "As Large Language Models (LLMs) are increasingly used for content creation, detecting AI-generated text has become a critical challenge. Prior work has largely focused on English, leaving low-resource languages such as Korean underexplored. We propose an unsupervised detection framework that integrates two complementary signals: syntactic token cohesiveness (TOCSIN) and semantic regeneration similarity (SimLLM). To support evaluation, we construct a Korean pairwise dataset of 1,000 anchors with continuation- and regeneration-style generations and further assess performance across domains (news, research paper abstracts, essays) and model families (GPT-3.5 Turbo, GPT-4o, HyperCLOVA X, LLaMA-3-8B). Without any training, our ensemble achieves up to 0.963 F1 and 0.985 ROC-AUC, outperforming baselines. These results demonstrate that the combination of syntactic and semantic cues enables robust unsupervised detection in low-resource settings. Code available at https://github.com/dxlabskku/llm-detection-main."
}
@comment{Markdown (Informal) citation from the ACL Anthology page:
[Unsupervised Detection of LLM-Generated Text in Korean Using Syntactic and Semantic Cues](https://preview.aclanthology.org/ingest-eacl/2026.findings-eacl.77/) (Jeon et al., Findings 2026)
ACL}