% Tanaka et al. (2020), ACL 2020 Student Research Workshop paper on a Japanese
% typo-correction dataset mined from Wikipedia's revision history.
@inproceedings{tanaka-etal-2020-building,
    title = "Building a {Japanese} Typo Dataset from {Wikipedia}'s Revision History",
    author = "Tanaka, Yu and
      Murawaki, Yugo and
      Kawahara, Daisuke and
      Kurohashi, Sadao",
    editor = "Rijhwani, Shruti and
      Liu, Jiangming and
      Wang, Yizhong and
      Dror, Rotem",
    booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics: Student Research Workshop",
    month = jul,
    year = "2020",
    address = "Online",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.acl-srw.31/",
    doi = "10.18653/v1/2020.acl-srw.31",
    pages = "230--236",
    abstract = "User generated texts contain many typos for which correction is necessary for NLP systems to work. Although a large number of typo{--}correction pairs are needed to develop a data-driven typo correction system, no such dataset is available for Japanese. In this paper, we extract over half a million Japanese typo{--}correction pairs from Wikipedia's revision history. Unlike other languages, Japanese poses unique challenges: (1) Japanese texts are unsegmented so that we cannot simply apply a spelling checker, and (2) the way people inputting kanji logographs results in typos with drastically different surface forms from correct ones. We address them by combining character-based extraction rules, morphological analyzers to guess readings, and various filtering methods. We evaluate the dataset using crowdsourcing and run a baseline seq2seq model for typo correction."
}
Markdown (Informal)
[Building a Japanese Typo Dataset from Wikipedia’s Revision History](https://aclanthology.org/2020.acl-srw.31/) (Tanaka et al., ACL 2020)
ACL