% Conference paper: EACL 2026, Volume 1 (Long Papers).
% The citation key is kept unchanged so existing citations keep resolving.
% NOTE(review): url now uses the canonical Anthology address for this paper ID
% instead of the temporary ingest-preview host -- confirm that it resolves.
@inproceedings{choi-etal-2026-k,
  title     = {{K-LegalDeID}: A Benchmark Dataset and {KLUEBERT-CRF} for De-identification in {Korean} Court Judgments},
  author    = {Choi, Wooseok and
               Kim, Hyungbin and
               Chung, Yon Dohn},
  editor    = {Demberg, Vera and
               Inui, Kentaro and
               Marquez, Llu{\'i}s},
  booktitle = {Proceedings of the 19th Conference of the {European} Chapter of the {Association} for {Computational} {Linguistics} (Volume 1: Long Papers)},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.eacl-long.103/},
  pages     = {2308--2325},
  isbn      = {979-8-89176-380-7},
  abstract  = {The Korean legal system mandates public access to court judgments to ensure judicial transparency. However, this requirement conflicts with privacy protection obligations due to the prevalence of Personally Identifiable Information (PII) in legal documents. To address this challenge, we introduce K-LegalDeID, a large-scale benchmark dataset and an efficient KLUEBERT-CRF model for de-identification for Korean court judgments. Our primary contribution is a new large-scale benchmark dataset spanning 39 legal domains, with its quality validated by a high inter-annotator agreement (IAA) with Fleiss' Kappa of 0.7352. Our results demonstrate that a lightweight KLUEBERT-CRF model, when trained on our dataset, achieves state-of-the-art performance with an entity-level micro F1 score of 0.9923. Our end-to-end framework offers a practical and computationally efficient solution for real-world legal systems.},
}

Markdown (Informal)
[K-LegalDeID: A Benchmark Dataset and KLUEBERT-CRF for De-identification in Korean Court Judgments](https://aclanthology.org/2026.eacl-long.103/) (Choi et al., EACL 2026)
ACL