@comment{TACL journal article (volume 5, 2017). The url field holds the canonical ACL Anthology address for paper Q17-1013; the doi field is stored bare, without a resolver prefix.}
@article{fujii-etal-2017-nonparametric,
  title     = {Nonparametric {Bayesian} Semi-supervised Word Segmentation},
  author    = {Fujii, Ryo and
               Domoto, Ryo and
               Mochihashi, Daichi},
  editor    = {Lee, Lillian and
               Johnson, Mark and
               Toutanova, Kristina},
  journal   = {Transactions of the Association for Computational Linguistics},
  volume    = {5},
  year      = {2017},
  address   = {Cambridge, MA},
  publisher = {MIT Press},
  url       = {https://aclanthology.org/Q17-1013/},
  doi       = {10.1162/tacl_a_00054},
  pages     = {179--189},
  abstract  = {This paper presents a novel hybrid generative/discriminative model of word segmentation based on nonparametric Bayesian methods. Unlike ordinary discriminative word segmentation which relies only on labeled data, our semi-supervised model also leverages a huge amounts of unlabeled text to automatically learn new {\textquotedblleft}words{\textquotedblright}, and further constrains them by using a labeled data to segment non-standard texts such as those found in social networking services. Specifically, our hybrid model combines a discriminative classifier (CRF; Lafferty et al. (2001)) and unsupervised word segmentation (NPYLM; Mochihashi et al. (2009)), with a transparent exchange of information between these two model structures within the semi-supervised framework (JESS-CM; Suzuki and Isozaki (2008)). We confirmed that it can appropriately segment non-standard texts like those in Twitter and Weibo and has nearly state-of-the-art accuracy on standard datasets in Japanese, Chinese, and Thai.},
}
Markdown (Informal)
[Nonparametric Bayesian Semi-supervised Word Segmentation](https://aclanthology.org/Q17-1013/) (Fujii et al., TACL 2017)
ACL