% ACL Anthology record 2017.iwslt-1.12: conference paper, pages 82--89.
% The day range is attached to the month macro with "#" concatenation; the tie (~)
% keeps month and days on one line and "--" typesets the range as an en dash.
@inproceedings{hassan-etal-2017-synthetic,
    title     = {Synthetic Data for Neural Machine Translation of Spoken-Dialects},
    author    = {Hassan, Hany and
                 Elaraby, Mostafa and
                 Tawfik, Ahmed Y.},
    booktitle = {Proceedings of the 14th International Conference on Spoken Language Translation},
    month     = dec # "~14--15",
    year      = {2017},
    address   = {Tokyo, Japan},
    publisher = {International Workshop on Spoken Language Translation},
    url       = {https://aclanthology.org/2017.iwslt-1.12},
    pages     = {82--89},
    abstract  = {In this paper, we introduce a novel approach to generate synthetic data for training Neural Machine Translation systems. The proposed approach supports language variants and dialects with very limited parallel training data. This is achieved using a seed data to project words from a closely-related resource-rich language to an under-resourced language variant via word embedding representations. The proposed approach is based on localized embedding projection of distributed representations which utilizes monolingual embeddings and approximate nearest neighbors queries to transform parallel data across language variants. Our approach is language independent and can be used to generate data for any variant of the source language such as slang or spoken dialect or even for a different language that is related to the source language. We report experimental results on Levantine to English translation using Neural Machine Translation. We show that the synthetic data can provide significant improvements over a very large scale system by more than 2.8 Bleu points and it can be used to provide a reliable translation system for a spoken dialect which does not have sufficient parallel data.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 description of the conference paper with citekey hassan-etal-2017-synthetic. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="hassan-etal-2017-synthetic">
    <titleInfo>
      <title>Synthetic Data for Neural Machine Translation of Spoken-Dialects</title>
    </titleInfo>
    <!-- Authors, in publication order. -->
    <name type="personal">
      <namePart type="given">Hany</namePart>
      <namePart type="family">Hassan</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Mostafa</namePart>
      <namePart type="family">Elaraby</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Ahmed</namePart>
      <namePart type="given">Y.</namePart>
      <namePart type="family">Tawfik</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <!-- Date range December 14 to 15, 2017, written as an ISO 8601 interval (start/end). -->
      <dateIssued>2017-12-14/2017-12-15</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume that contains this paper. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 14th International Conference on Spoken Language Translation</title>
      </titleInfo>
      <originInfo>
        <publisher>International Workshop on Spoken Language Translation</publisher>
        <place>
          <placeTerm type="text">Tokyo, Japan</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>In this paper, we introduce a novel approach to generate synthetic data for training Neural Machine Translation systems. The proposed approach supports language variants and dialects with very limited parallel training data. This is achieved using a seed data to project words from a closely-related resource-rich language to an under-resourced language variant via word embedding representations. The proposed approach is based on localized embedding projection of distributed representations which utilizes monolingual embeddings and approximate nearest neighbors queries to transform parallel data across language variants. Our approach is language independent and can be used to generate data for any variant of the source language such as slang or spoken dialect or even for a different language that is related to the source language. We report experimental results on Levantine to English translation using Neural Machine Translation. We show that the synthetic data can provide significant improvements over a very large scale system by more than 2.8 Bleu points and it can be used to provide a reliable translation system for a spoken dialect which does not have sufficient parallel data.</abstract>
    <identifier type="citekey">hassan-etal-2017-synthetic</identifier>
    <location>
      <url>https://aclanthology.org/2017.iwslt-1.12</url>
    </location>
    <!-- Position of the paper within the host volume: same date range, pages 82 to 89. -->
    <part>
      <date>2017-12-14/2017-12-15</date>
      <extent unit="page">
        <start>82</start>
        <end>89</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Synthetic Data for Neural Machine Translation of Spoken-Dialects
%A Hassan, Hany
%A Elaraby, Mostafa
%A Tawfik, Ahmed Y.
%S Proceedings of the 14th International Conference on Spoken Language Translation
%D 2017
%8 December 14-15
%I International Workshop on Spoken Language Translation
%C Tokyo, Japan
%F hassan-etal-2017-synthetic
%X In this paper, we introduce a novel approach to generate synthetic data for training Neural Machine Translation systems. The proposed approach supports language variants and dialects with very limited parallel training data. This is achieved using a seed data to project words from a closely-related resource-rich language to an under-resourced language variant via word embedding representations. The proposed approach is based on localized embedding projection of distributed representations which utilizes monolingual embeddings and approximate nearest neighbors queries to transform parallel data across language variants. Our approach is language independent and can be used to generate data for any variant of the source language such as slang or spoken dialect or even for a different language that is related to the source language. We report experimental results on Levantine to English translation using Neural Machine Translation. We show that the synthetic data can provide significant improvements over a very large scale system by more than 2.8 Bleu points and it can be used to provide a reliable translation system for a spoken dialect which does not have sufficient parallel data.
%U https://aclanthology.org/2017.iwslt-1.12
%P 82-89
Markdown (Informal)
[Synthetic Data for Neural Machine Translation of Spoken-Dialects](https://aclanthology.org/2017.iwslt-1.12) (Hassan et al., IWSLT 2017)
ACL