% Workshop paper, ACL Anthology ID 2023.cxgsnlp-1.5. The url field uses the
% canonical aclanthology.org address rather than a temporary preview-branch host.
@inproceedings{fan-sun-2023-constructivist,
  title     = {Constructivist Tokenization for {English}},
  author    = {Fan, Allison and
               Sun, Weiwei},
  editor    = {Bonial, Claire and
               Tayyar Madabushi, Harish},
  booktitle = {Proceedings of the First International Workshop on Construction Grammars and {NLP} ({CxGs+NLP}, {GURT/SyntaxFest} 2023)},
  month     = mar,
  year      = {2023},
  address   = {Washington, D.C.},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.cxgsnlp-1.5/},
  pages     = {36--40},
  abstract  = {This paper revisits tokenization from a theoretical perspective, and argues for the necessity of a constructivist approach to tokenization for semantic parsing and modeling language acquisition. We consider two problems: (1) (semi-) automatically converting existing lexicalist annotations, e.g. those of the Penn TreeBank, into constructivist annotations, and (2) automatic tokenization of raw texts. We demonstrate that (1) a heuristic rule-based constructivist tokenizer is able to yield relatively satisfactory accuracy when gold standard Penn TreeBank part-of-speech tags are available, but that some manual annotations are still necessary to obtain gold standard results, and (2) a neural tokenizer is able to provide accurate automatic constructivist tokenization results from raw character sequences. Our research output also includes a set of high-quality morpheme-tokenized corpora, which enable the training of computational models that more closely align with language comprehension and acquisition.},
}
Markdown (Informal)
[Constructivist Tokenization for English](https://aclanthology.org/2023.cxgsnlp-1.5/) (Fan & Sun, CxGsNLP-SyntaxFest 2023)
ACL
- Allison Fan and Weiwei Sun. 2023. Constructivist Tokenization for English. In Proceedings of the First International Workshop on Construction Grammars and NLP (CxGs+NLP, GURT/SyntaxFest 2023), pages 36–40, Washington, D.C. Association for Computational Linguistics.