@comment{ikeda-etal-2016-japanese: WNUT 2016 workshop paper, ACL Anthology ID W16-3918. The url field uses the canonical aclanthology.org address, not a preview-branch host.}
@inproceedings{ikeda-etal-2016-japanese,
  title     = {{Japanese} Text Normalization with Encoder-Decoder Model},
  author    = {Ikeda, Taishi and
               Shindo, Hiroyuki and
               Matsumoto, Yuji},
  editor    = {Han, Bo and
               Ritter, Alan and
               Derczynski, Leon and
               Xu, Wei and
               Baldwin, Tim},
  booktitle = {Proceedings of the 2nd Workshop on Noisy User-generated Text ({WNUT})},
  month     = dec,
  year      = {2016},
  address   = {Osaka, Japan},
  publisher = {The COLING 2016 Organizing Committee},
  url       = {https://aclanthology.org/W16-3918/},
  pages     = {129--137},
  abstract  = {Text normalization is the task of transforming lexical variants to their canonical forms. We model the problem of text normalization as a character-level sequence to sequence learning problem and present a neural encoder-decoder model for solving it. To train the encoder-decoder model, many sentences pairs are generally required. However, Japanese non-standard canonical pairs are scarce in the form of parallel corpora. To address this issue, we propose a method of data augmentation to increase data size by converting existing resources into synthesized non-standard forms using handcrafted rules. We conducted an experiment to demonstrate that the synthesized corpus contributes to stably train an encoder-decoder model and improve the performance of Japanese text normalization.},
}
Markdown (Informal)
[Japanese Text Normalization with Encoder-Decoder Model](https://aclanthology.org/W16-3918/) (Ikeda et al., WNUT 2016)
ACL