@article{marie-fujita-2020-synthesizing,
title = "Synthesizing Parallel Data of User-Generated Texts with Zero-Shot Neural Machine Translation",
author = "Marie, Benjamin and
Fujita, Atsushi",
journal = "Transactions of the Association for Computational Linguistics",
volume = "8",
year = "2020",
address = "Cambridge, MA",
publisher = "MIT Press",
url = "https://aclanthology.org/2020.tacl-1.46",
doi = "10.1162/tacl_a_00341",
pages = "710--725",
abstract = "Neural machine translation (NMT) systems are usually trained on clean parallel data. They can perform very well for translating clean in-domain texts. However, as demonstrated by previous work, the translation quality significantly worsens when translating noisy texts, such as user-generated texts (UGT) from online social media. Given the lack of parallel data of UGT that can be used to train or adapt NMT systems, we synthesize parallel data of UGT, exploiting monolingual data of UGT through crosslingual language model pre-training and zero-shot NMT systems. This paper presents two different but complementary approaches: One alters given clean parallel data into UGT-like parallel data whereas the other generates translations from monolingual data of UGT. On the MTNT translation tasks, we show that our synthesized parallel data can lead to better NMT systems for UGT while making them more robust in translating texts from various domains and styles.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="marie-fujita-2020-synthesizing">
<titleInfo>
<title>Synthesizing Parallel Data of User-Generated Texts with Zero-Shot Neural Machine Translation</title>
</titleInfo>
<name type="personal">
<namePart type="given">Benjamin</namePart>
<namePart type="family">Marie</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Atsushi</namePart>
<namePart type="family">Fujita</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<genre>journal article</genre>
<relatedItem type="host">
<titleInfo>
<title>Transactions of the Association for Computational Linguistics</title>
</titleInfo>
<originInfo>
<issuance>continuing</issuance>
<publisher>MIT Press</publisher>
<place>
<placeTerm type="text">Cambridge, MA</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">periodical</genre>
<genre>academic journal</genre>
</relatedItem>
<abstract>Neural machine translation (NMT) systems are usually trained on clean parallel data. They can perform very well for translating clean in-domain texts. However, as demonstrated by previous work, the translation quality significantly worsens when translating noisy texts, such as user-generated texts (UGT) from online social media. Given the lack of parallel data of UGT that can be used to train or adapt NMT systems, we synthesize parallel data of UGT, exploiting monolingual data of UGT through crosslingual language model pre-training and zero-shot NMT systems. This paper presents two different but complementary approaches: One alters given clean parallel data into UGT-like parallel data whereas the other generates translations from monolingual data of UGT. On the MTNT translation tasks, we show that our synthesized parallel data can lead to better NMT systems for UGT while making them more robust in translating texts from various domains and styles.</abstract>
<identifier type="citekey">marie-fujita-2020-synthesizing</identifier>
<identifier type="doi">10.1162/tacl_a_00341</identifier>
<location>
<url>https://aclanthology.org/2020.tacl-1.46</url>
</location>
<part>
<date>2020</date>
<detail type="volume"><number>8</number></detail>
<extent unit="page">
<start>710</start>
<end>725</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Journal Article
%T Synthesizing Parallel Data of User-Generated Texts with Zero-Shot Neural Machine Translation
%A Marie, Benjamin
%A Fujita, Atsushi
%J Transactions of the Association for Computational Linguistics
%D 2020
%V 8
%I MIT Press
%C Cambridge, MA
%F marie-fujita-2020-synthesizing
%X Neural machine translation (NMT) systems are usually trained on clean parallel data. They can perform very well for translating clean in-domain texts. However, as demonstrated by previous work, the translation quality significantly worsens when translating noisy texts, such as user-generated texts (UGT) from online social media. Given the lack of parallel data of UGT that can be used to train or adapt NMT systems, we synthesize parallel data of UGT, exploiting monolingual data of UGT through crosslingual language model pre-training and zero-shot NMT systems. This paper presents two different but complementary approaches: One alters given clean parallel data into UGT-like parallel data whereas the other generates translations from monolingual data of UGT. On the MTNT translation tasks, we show that our synthesized parallel data can lead to better NMT systems for UGT while making them more robust in translating texts from various domains and styles.
%9 journal article
%R 10.1162/tacl_a_00341
%U https://aclanthology.org/2020.tacl-1.46
%U https://doi.org/10.1162/tacl_a_00341
%P 710-725
Markdown (Informal)
[Synthesizing Parallel Data of User-Generated Texts with Zero-Shot Neural Machine Translation](https://aclanthology.org/2020.tacl-1.46) (Marie & Fujita, TACL 2020)
ACL