% Hopton, Scherrer and Samardzic (2025): long paper in the NAACL-HLT 2025 proceedings, Volume 1.
% ACL Anthology ID: 2025.naacl-long.398. The url field uses the canonical
% aclanthology.org host rather than a branch-specific preview build.
@inproceedings{hopton-etal-2025-functional,
  title     = {Functional Lexicon in Subword Tokenization},
  author    = {Hopton, Zachary William and
               Scherrer, Yves and
               Samardzic, Tanja},
  editor    = {Chiruzzo, Luis and
               Ritter, Alan and
               Wang, Lu},
  booktitle = {Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)},
  month     = apr,
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.naacl-long.398/},
  pages     = {7839--7853},
  isbn      = {979-8-89176-189-6},
  abstract  = {The distinction between function and content units of the lexicon has been somewhat neglected in recent NLP work, but it could still be useful when working with low-resource languages, and, in particular, to improve cross-lingual transfer. In this paper, we investigate to what extent BPE subword tokenization can be used to identify units of the functional lexicon in a language without any annotated data. We analyze subword tokens in terms of their productivity and attempt to find thresholds that best distinguish function from content tokens. On a sample of seven diverse languages, we find that the best results are obtained with 50 BPE merges. We also show that this subword tokenization setting can be beneficial for the interlinear glossing task.},
}
Markdown (Informal)
[Functional Lexicon in Subword Tokenization](https://aclanthology.org/2025.naacl-long.398/) (Hopton et al., NAACL 2025)
ACL
- Zachary William Hopton, Yves Scherrer, and Tanja Samardzic. 2025. Functional Lexicon in Subword Tokenization. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 7839–7853, Albuquerque, New Mexico. Association for Computational Linguistics.