% Workshop paper introducing fugashi, a MeCab wrapper for Python (NLP-OSS 2020).
% The url field uses the canonical ACL Anthology address, not a preview-branch deployment.
@inproceedings{mccann-2020-fugashi,
  title     = {{fugashi}, a Tool for Tokenizing {Japanese} in {Python}},
  author    = {McCann, Paul},
  editor    = {Park, Eunjeong L. and
               Hagiwara, Masato and
               Milajevs, Dmitrijs and
               Liu, Nelson F. and
               Chauhan, Geeticka and
               Tan, Liling},
  booktitle = {Proceedings of Second Workshop for {NLP} Open Source Software ({NLP-OSS})},
  month     = nov,
  year      = {2020},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2020.nlposs-1.7/},
  doi       = {10.18653/v1/2020.nlposs-1.7},
  pages     = {44--51},
  abstract  = {Recent years have seen an increase in the number of large-scale multilingual NLP projects. However, even in such projects, languages with special processing requirements are often excluded. One such language is Japanese. Japanese is written without spaces, tokenization is non-trivial, and while high quality open source tokenizers exist they can be hard to use and lack English documentation. This paper introduces fugashi, a MeCab wrapper for Python, and gives an introduction to tokenizing Japanese.},
}
Markdown (Informal)
[fugashi, a Tool for Tokenizing Japanese in Python](https://aclanthology.org/2020.nlposs-1.7/) (McCann, NLPOSS 2020)
ACL