% Conference paper: Lam, Lau and Lee (2024), "Multi-Tiered Cantonese Word Segmentation", LREC-COLING 2024.
% The url field holds the canonical ACL Anthology address for record 2024.lrec-main.1047,
% so the link does not depend on a temporary preview deployment of the Anthology site.
@inproceedings{lam-etal-2024-multi,
  title     = {Multi-Tiered {Cantonese} Word Segmentation},
  author    = {Lam, Charles and
               Lau, Chaak-ming and
               Lee, Jackson L.},
  editor    = {Calzolari, Nicoletta and
               Kan, Min-Yen and
               Hoste, Veronique and
               Lenci, Alessandro and
               Sakti, Sakriani and
               Xue, Nianwen},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)},
  month     = may,
  year      = {2024},
  address   = {Torino, Italia},
  publisher = {ELRA and ICCL},
  url       = {https://aclanthology.org/2024.lrec-main.1047/},
  pages     = {11993--12002},
  abstract  = {Word segmentation for Chinese text data is essential for compiling corpora and any other tasks where the notion of ``word'' is assumed, since Chinese orthography does not have conventional word boundaries as languages such as English do. A perennial issue, however, is that there is no consensus about the definition of ``word'' in Chinese, which makes word segmentation challenging. Recent work in Chinese word segmentation has begun to embrace the idea of multiple word segmentation possibilities. In a similar spirit, this paper focuses on Cantonese, another major Chinese variety. We propose a linguistically motivated, multi-tiered word segmentation system for Cantonese, and release a Cantonese corpus of 150,000 characters word-segmented by this proposal. Our work will be of interest to researchers whose work involves Cantonese corpus data.},
}
Markdown (Informal)
[Multi-Tiered Cantonese Word Segmentation](https://aclanthology.org/2024.lrec-main.1047/) (Lam et al., LREC-COLING 2024)
ACL
- Charles Lam, Chaak-ming Lau, and Jackson L. Lee. 2024. Multi-Tiered Cantonese Word Segmentation. In Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024), pages 11993–12002, Torino, Italia. ELRA and ICCL.