% BibTeX record for the WANLP 2020 paper; the MODS and EndNote records below describe the same publication.
@inproceedings{kchaou-etal-2020-parallel,
    title = "Parallel resources for {T}unisian {A}rabic Dialect Translation",
    author = "Kchaou, Sam{\'e}h and
      Boujelbane, Rahma and
      Hadrich-Belguith, Lamia",
    booktitle = "Proceedings of the Fifth Arabic Natural Language Processing Workshop",
    month = dec,
    year = "2020",
    address = "Barcelona, Spain (Online)",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.wanlp-1.18",
    pages = "200--206",
    abstract = "The difficulty of processing dialects is clearly observed in the high cost of building representative corpus, in particular for machine translation. Indeed, all machine translation systems require a huge amount and good management of training data, which represents a challenge in a low-resource setting such as the Tunisian Arabic dialect. In this paper, we present a data augmentation technique to create a parallel corpus for Tunisian Arabic dialect written in social media and standard Arabic in order to build a Machine Translation (MT) model. The created corpus was used to build a sentence-based translation model. This model reached a BLEU score of 15.03{\%} on a test set, while it was limited to 13.27{\%} utilizing the corpus without augmentation.",
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS v3 collection holding a single record, kchaou-etal-2020-parallel. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="kchaou-etal-2020-parallel">
    <titleInfo>
      <title>Parallel resources for Tunisian Arabic Dialect Translation</title>
    </titleInfo>
    <!-- Authors, in document order; each name carries the marcrelator "author" role. -->
    <name type="personal">
      <namePart type="given">Saméh</namePart>
      <namePart type="family">Kchaou</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Rahma</namePart>
      <namePart type="family">Boujelbane</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Lamia</namePart>
      <namePart type="family">Hadrich-Belguith</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2020-dec</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume in which the paper appears. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Fifth Arabic Natural Language Processing Workshop</title>
      </titleInfo>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Barcelona, Spain (Online)</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <!-- The abstract is text content: keep it on one line so no indentation whitespace leaks into the data. -->
    <abstract>The difficulty of processing dialects is clearly observed in the high cost of building representative corpus, in particular for machine translation. Indeed, all machine translation systems require a huge amount and good management of training data, which represents a challenge in a low-resource setting such as the Tunisian Arabic dialect. In this paper, we present a data augmentation technique to create a parallel corpus for Tunisian Arabic dialect written in social media and standard Arabic in order to build a Machine Translation (MT) model. The created corpus was used to build a sentence-based translation model. This model reached a BLEU score of 15.03% on a test set, while it was limited to 13.27% utilizing the corpus without augmentation.</abstract>
    <identifier type="citekey">kchaou-etal-2020-parallel</identifier>
    <location>
      <url>https://aclanthology.org/2020.wanlp-1.18</url>
    </location>
    <!-- Position within the host item: issue date and page extent. -->
    <part>
      <date>2020-dec</date>
      <extent unit="page">
        <start>200</start>
        <end>206</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Parallel resources for Tunisian Arabic Dialect Translation
%A Kchaou, Saméh
%A Boujelbane, Rahma
%A Hadrich-Belguith, Lamia
%S Proceedings of the Fifth Arabic Natural Language Processing Workshop
%D 2020
%8 dec
%I Association for Computational Linguistics
%C Barcelona, Spain (Online)
%F kchaou-etal-2020-parallel
%X The difficulty of processing dialects is clearly observed in the high cost of building representative corpus, in particular for machine translation. Indeed, all machine translation systems require a huge amount and good management of training data, which represents a challenge in a low-resource setting such as the Tunisian Arabic dialect. In this paper, we present a data augmentation technique to create a parallel corpus for Tunisian Arabic dialect written in social media and standard Arabic in order to build a Machine Translation (MT) model. The created corpus was used to build a sentence-based translation model. This model reached a BLEU score of 15.03% on a test set, while it was limited to 13.27% utilizing the corpus without augmentation.
%U https://aclanthology.org/2020.wanlp-1.18
%P 200-206
Markdown (Informal)
[Parallel resources for Tunisian Arabic Dialect Translation](https://aclanthology.org/2020.wanlp-1.18) (Kchaou et al., WANLP 2020)
ACL