% PILOT-Bench workshop paper (Jang et al., NLLP 2025).
% Non-ASCII letters in editor names are written as LaTeX accent groups so that
% classic 8-bit BibTeX can sort and typeset them; the url field holds the
% permanent ACL Anthology address rather than the temporary ingest preview.
@inproceedings{jang-etal-2025-pilot,
  title     = {{PILOT}-Bench: A Benchmark for Legal Reasoning in the Patent Domain with {IRAC}-Aligned Classification Tasks},
  author    = {Jang, Yehoon and
               Lee, Chaewon and
               Min, Hyun-seok and
               Choi, Sungchul},
  editor    = {Aletras, Nikolaos and
               Chalkidis, Ilias and
               Barrett, Leslie and
               Goan{\textcommabelow{t}}{\u{a}}, C{\u{a}}t{\u{a}}lina and
               Preo{\textcommabelow{t}}iuc-Pietro, Daniel and
               Spanakis, Gerasimos},
  booktitle = {Proceedings of the Natural Legal Language Processing Workshop 2025},
  month     = nov,
  year      = {2025},
  address   = {Suzhou, China},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.nllp-1.17/},
  pages     = {240--280},
  isbn      = {979-8-89176-338-8},
  abstract  = {The Patent Trial and Appeal Board (PTAB) of the USPTO adjudicates thousands of ex parte appeals each year, requiring the integration of technical understanding and legal reasoning. While large language models (LLMs) are increasingly applied in patent and legal practice, their use has remained limited to lightweight tasks, with no established means of systematically evaluating their capacity for structured legal reasoning in the patent domain. To address this gap, we introduce PILOT-Bench (Patent InvaLidatiOn Trial Benchmark), a dataset and benchmark that aligns PTAB decisions with USPTO patent data at the case level and formalizes three IRAC-aligned classification tasks: Issue Type, Board Authorities, and Subdecision. We evaluate a diverse set of commercial and open-source LLMs and conduct analyses across multiple perspectives, including input-variation settings, model families, and error tendencies. Notably, on the Issue Type task, commercial models consistently exceed 0.75 in Exact Match, whereas the strongest open-source model (Qwen-8B) achieves performance around 0.56, highlighting the substantial gap in reasoning capabilities. PILOT-Bench establishes a foundation for the systematic evaluation of patent-domain legal reasoning and points toward future directions for improving LLMs through dataset design and model alignment. All data, code, and benchmark resources are available at https://github.com/TeamLab/pilot-bench.},
}
Markdown (Informal)
[PILOT-Bench: A Benchmark for Legal Reasoning in the Patent Domain with IRAC-Aligned Classification Tasks](https://aclanthology.org/2025.nllp-1.17/) (Jang et al., NLLP 2025)
ACL