@inproceedings{saunders-etal-2020-inference,
title = "Inference-only sub-character decomposition improves translation of unseen logographic characters",
author = "Saunders, Danielle and
Feely, Weston and
Byrne, Bill",
booktitle = "Proceedings of the 7th Workshop on Asian Translation",
month = dec,
year = "2020",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.wat-1.21",
pages = "170--177",
abstract = "Neural Machine Translation (NMT) on logographic source languages struggles when translating {`}unseen{'} characters, which never appear in the training data. One possible approach to this problem uses sub-character decomposition for training and test sentences. However, this approach involves complete retraining, and its effectiveness for unseen character translation to non-logographic languages has not been fully explored. We investigate existing ideograph-based sub-character decomposition approaches for Chinese-to-English and Japanese-to-English NMT, for both high-resource and low-resource domains. For each language pair and domain we construct a test set where all source sentences contain at least one unseen logographic character. We find that complete sub-character decomposition often harms unseen character translation, and gives inconsistent results generally. We offer a simple alternative based on decomposition before inference for unseen characters only. Our approach allows flexible application, achieving translation adequacy improvements and requiring no additional models or training.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="saunders-etal-2020-inference">
<titleInfo>
<title>Inference-only sub-character decomposition improves translation of unseen logographic characters</title>
</titleInfo>
<name type="personal">
<namePart type="given">Danielle</namePart>
<namePart type="family">Saunders</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Weston</namePart>
<namePart type="family">Feely</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Bill</namePart>
<namePart type="family">Byrne</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-dec</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the 7th Workshop on Asian Translation</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Suzhou, China</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Neural Machine Translation (NMT) on logographic source languages struggles when translating ‘unseen’ characters, which never appear in the training data. One possible approach to this problem uses sub-character decomposition for training and test sentences. However, this approach involves complete retraining, and its effectiveness for unseen character translation to non-logographic languages has not been fully explored. We investigate existing ideograph-based sub-character decomposition approaches for Chinese-to-English and Japanese-to-English NMT, for both high-resource and low-resource domains. For each language pair and domain we construct a test set where all source sentences contain at least one unseen logographic character. We find that complete sub-character decomposition often harms unseen character translation, and gives inconsistent results generally. We offer a simple alternative based on decomposition before inference for unseen characters only. Our approach allows flexible application, achieving translation adequacy improvements and requiring no additional models or training.</abstract>
<identifier type="citekey">saunders-etal-2020-inference</identifier>
<location>
<url>https://aclanthology.org/2020.wat-1.21</url>
</location>
<part>
<date>2020-dec</date>
<extent unit="page">
<start>170</start>
<end>177</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Inference-only sub-character decomposition improves translation of unseen logographic characters
%A Saunders, Danielle
%A Feely, Weston
%A Byrne, Bill
%S Proceedings of the 7th Workshop on Asian Translation
%D 2020
%8 dec
%I Association for Computational Linguistics
%C Suzhou, China
%F saunders-etal-2020-inference
%X Neural Machine Translation (NMT) on logographic source languages struggles when translating ‘unseen’ characters, which never appear in the training data. One possible approach to this problem uses sub-character decomposition for training and test sentences. However, this approach involves complete retraining, and its effectiveness for unseen character translation to non-logographic languages has not been fully explored. We investigate existing ideograph-based sub-character decomposition approaches for Chinese-to-English and Japanese-to-English NMT, for both high-resource and low-resource domains. For each language pair and domain we construct a test set where all source sentences contain at least one unseen logographic character. We find that complete sub-character decomposition often harms unseen character translation, and gives inconsistent results generally. We offer a simple alternative based on decomposition before inference for unseen characters only. Our approach allows flexible application, achieving translation adequacy improvements and requiring no additional models or training.
%U https://aclanthology.org/2020.wat-1.21
%P 170-177
Markdown (Informal)
[Inference-only sub-character decomposition improves translation of unseen logographic characters](https://aclanthology.org/2020.wat-1.21) (Saunders et al., WAT 2020)
ACL