@inproceedings{demir-canbaz-2025-validate,
    title = "Validate Your Authority: Benchmarking {LLM}s on Multi-Label Precedent Treatment Classification",
    author = "Demir, M. Mikail and
      Canbaz, M. Abdullah",
    editor = "Aletras, Nikolaos and
      Chalkidis, Ilias and
      Barrett, Leslie and
      Goanț{\u{a}}, C{\u{a}}t{\u{a}}lina and
      Preoțiuc-Pietro, Daniel and
      Spanakis, Gerasimos",
    booktitle = "Proceedings of the Natural Legal Language Processing Workshop 2025",
    month = nov,
    year = "2025",
    address = "Suzhou, China",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2025.nllp-1.13/",
    doi = "10.18653/v1/2025.nllp-1.13",
    pages = "172--183",
    isbn = "979-8-89176-338-8",
    abstract = "Automating the classification of negative treatment in legal precedent is a critical yet nuanced NLP task where misclassification carries significant risk. To address the shortcomings of standard accuracy, this paper introduces a more robust evaluation framework. We benchmark modern Large Language Models on a new, expert-annotated dataset of 239 real-world legal citations and propose a novel Average Severity Error metric to better measure the practical impact of classification errors. Our experiments reveal a performance split: Google{'}s Gemini 2.5 Flash achieved the highest accuracy on a high-level classification task (79.1{\%}), while OpenAI{'}s GPT-5-mini was the top performer on the more complex fine-grained schema (67.7{\%}). This work establishes a crucial baseline, provides a new context-rich dataset, and introduces an evaluation metric tailored to the demands of this complex legal reasoning task."
}
@comment{Markdown (Informal)}
@comment{
[Validate Your Authority: Benchmarking LLMs on Multi-Label Precedent Treatment Classification](https://aclanthology.org/2025.nllp-1.13/) (Demir & Canbaz, NLLP 2025)
ACL
}