% NALOMA 2025 workshop paper (ACL Anthology ID 2025.naloma-1.1).
% The url field uses the canonical Anthology address instead of a temporary
% preview/ingestion build, which is not a stable citation target.
@inproceedings{chu-etal-2025-unpacking,
  title     = {Unpacking Legal Reasoning in {LLM}s: Chain-of-Thought as a Key to Human-Machine Alignment in Essay-Based {NLU} Tasks},
  author    = {Chu, Yu Ying and
               Huang, Sieh-chuen and
               Shao, Hsuan-Lei},
  editor    = {Abzianidze, Lasha and
               de Paiva, Valeria},
  booktitle = {Proceedings of the 5th Workshop on Natural Logic Meets Machine Learning (NALOMA)},
  month     = aug,
  year      = {2025},
  address   = {Bochum, Germany},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.naloma-1.1/},
  pages     = {1--7},
  isbn      = {979-8-89176-287-9},
  abstract  = {This study evaluates how Large Language Models (LLMs) perform deep legal reasoning on Taiwanese Status Law questions and investigates how Chain-of-Thought (CoT) prompting affects interpretability, alignment, and generalization. Using a two-stage evaluation framework, we first decomposed six real legal essay questions into 68 sub-questions covering issue spotting, statutory application, and inheritance computation. In Stage Two, full-length answers were collected under baseline and CoT-prompted conditions. Four LLMs{---}ChatGPT-4o, Gemini, Grok3, and Copilot{---}were tested. Results show CoT prompting significantly improved accuracy for Gemini (from 83.2{\%} to 94.5{\%}, p {\ensuremath{<}} 0.05) and Grok3, with moderate but consistent gains for ChatGPT and Copilot. Human evaluation of full-length responses revealed CoT answers received notably higher scores in issue coverage and reasoning clarity, with ChatGPT and Gemini gaining +2.67 and +1.92 points respectively. Despite these gains, legal misclassifications persist, highlighting alignment gaps between surface-level fluency and expert legal reasoning. This work opens the black box of legal NLU by tracing LLM reasoning chains, quantifying performance shifts under structured prompting, and providing a diagnostic benchmark for complex, open-ended legal tasks beyond multiple-choice settings.},
}
Markdown (Informal)
[Unpacking Legal Reasoning in LLMs: Chain-of-Thought as a Key to Human-Machine Alignment in Essay-Based NLU Tasks](https://aclanthology.org/2025.naloma-1.1/) (Chu et al., NALOMA 2025)
ACL