% Saunders, Feely and Byrne (2020), paper in the Proceedings of the 7th Workshop on Asian Translation.
% The url field uses the permanent ACL Anthology address rather than a temporary preview/staging build.
@inproceedings{saunders-etal-2020-inference,
    title     = {Inference-only sub-character decomposition improves translation of unseen logographic characters},
    author    = {Saunders, Danielle and
                 Feely, Weston and
                 Byrne, Bill},
    editor    = {Nakazawa, Toshiaki and
                 Nakayama, Hideki and
                 Ding, Chenchen and
                 Dabre, Raj and
                 Kunchukuttan, Anoop and
                 Pa, Win Pa and
                 Bojar, Ond{\v{r}}ej and
                 Parida, Shantipriya and
                 Goto, Isao and
                 Mino, Hidaya and
                 Manabe, Hiroshi and
                 Sudoh, Katsuhito and
                 Kurohashi, Sadao and
                 Bhattacharyya, Pushpak},
    booktitle = {Proceedings of the 7th Workshop on Asian Translation},
    month     = dec,
    year      = {2020},
    address   = {Suzhou, China},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2020.wat-1.21/},
    doi       = {10.18653/v1/2020.wat-1.21},
    pages     = {170--177},
    abstract  = {Neural Machine Translation (NMT) on logographic source languages struggles when translating {\textquoteleft}unseen{\textquoteright} characters, which never appear in the training data. One possible approach to this problem uses sub-character decomposition for training and test sentences. However, this approach involves complete retraining, and its effectiveness for unseen character translation to non-logographic languages has not been fully explored. We investigate existing ideograph-based sub-character decomposition approaches for Chinese-to-English and Japanese-to-English NMT, for both high-resource and low-resource domains. For each language pair and domain we construct a test set where all source sentences contain at least one unseen logographic character. We find that complete sub-character decomposition often harms unseen character translation, and gives inconsistent results generally. We offer a simple alternative based on decomposition before inference for unseen characters only. Our approach allows flexible application, achieving translation adequacy improvements and requiring no additional models or training.},
}
Markdown (Informal)
[Inference-only sub-character decomposition improves translation of unseen logographic characters](https://aclanthology.org/2020.wat-1.21/) (Saunders et al., WAT 2020)
ACL