@inproceedings{tseng-etal-2025-lawtoken,
title = "{L}aw{T}oken: a single token worth more than its constituents",
author = "Tseng, Yu-Hsiang and
Chou, Hsin-Yu and
Hsieh, Shu-Kai",
editor = "Boleda, Gemma and
Roth, Michael",
booktitle = "Proceedings of the 29th Conference on Computational Natural Language Learning",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/acl25-workshop-ingestion/2025.conll-1.3/",
pages = "30--46",
ISBN = "979-8-89176-271-8",
abstract = "Legal citations require correctly recalling the law references of complex law article names and article numbering, which large language models typically treat as multi-token sequences. Motivated by the form-meaning pair of constructionist approaches, we explore treating these multi-token law references as a single holistic law token and examining the implications for legal citation accuracy and differences in model interpretability. We train and compare two types of models: LawToken models, which encode the legal citations as a single law token, and LawBase models, which treat them as multi-token compounds. The results show that LawToken models outperform LawBase models on legal citation tasks, primarily due to fewer errors in the article numbering components. Further model representation analysis reveals that, while both models achieve comparable semantic representation quality, the multi-token-based LawBase suffers from degraded representations in multistep decoding, leading to more errors. Taken together, these findings suggest that form-meaning pairing can operate in a larger context, and this larger unit may offer advantages in future modeling of legal reasoning. In practice, this approach can significantly reduce the likelihood of hallucinations by anchoring legal citations as discrete, holistic tokens, thereby minimizing the risk of generating nonexistent or incorrect legal references."
}
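
For readers curious how the single-token encoding described in the abstract might look in practice, the sketch below is a minimal illustration, not the authors' code: it assumes a Hugging Face `transformers` setup, and the base model name and the `<LAW:...>` token strings are placeholders. It shows the general pattern of registering whole law references as holistic vocabulary items so each citation is emitted in one decoding step rather than as a multi-token span.

```python
# Minimal sketch (illustrative only) of a "LawToken"-style setup:
# treat an entire law reference as a single vocabulary item.
from transformers import AutoTokenizer, AutoModelForCausalLM

model_name = "gpt2"  # placeholder base model; the paper's models may differ
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Hypothetical inventory of law references to encode as single tokens.
law_tokens = [
    "<LAW:CivilCode-184>",
    "<LAW:CriminalCode-271>",
]

# Add them as new vocabulary items and grow the embedding matrix to match.
tokenizer.add_tokens(law_tokens)
model.resize_token_embeddings(len(tokenizer))

# Each reference now maps to a single id instead of a multi-token sequence.
ids = tokenizer("<LAW:CivilCode-184>", add_special_tokens=False)["input_ids"]
assert len(ids) == 1
```

The new embeddings would then be learned during training on citation-bearing text; the contrast studied in the paper is between models trained with such holistic law tokens and models that keep citations as ordinary multi-token compounds.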