% AMTA 2014 (MT Researchers Track) conference paper. The month field appends
% the conference days to the oct macro, with a tie and an en dash for the range.
@inproceedings{mirkin-besacier-2014-data,
  author    = {Mirkin, Shachar and
               Besacier, Laurent},
  title     = {Data selection for compact adapted {SMT} models},
  booktitle = {Proceedings of the 11th Conference of the Association for Machine Translation in the Americas: MT Researchers Track},
  month     = oct # "~22--26",
  year      = {2014},
  address   = {Vancouver, Canada},
  publisher = {Association for Machine Translation in the Americas},
  pages     = {301--314},
  url       = {https://aclanthology.org/2014.amta-researchers.23},
  abstract  = {Data selection is a common technique for adapting statistical translation models for a specific domain, which has been shown to both improve translation quality and to reduce model size. Selection relies on some in-domain data, of the same domain of the texts expected to be translated. Selecting the sentence-pairs that are most similar to the in-domain data from a pool of parallel texts has been shown to be effective; yet, this approach holds the risk of resulting in a limited coverage, when necessary n-grams that do appear in the pool are less similar to in-domain data that is available in advance. Some methods select additional data based on the actual text that needs to be translated. While useful, this is not always a practical scenario. In this work we describe an extensive exploration of data selection techniques over Arabic to French datasets, and propose methods to address both similarity and coverage considerations while maintaining a limited model size.},
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <!-- MODS record for citekey mirkin-besacier-2014-data.
       Dates are given as W3CDTF start/end pairs (22 to 26 October 2014). -->
  <mods ID="mirkin-besacier-2014-data">
    <titleInfo>
      <title>Data selection for compact adapted SMT models</title>
    </titleInfo>
    <name type="personal">
      <namePart type="given">Shachar</namePart>
      <namePart type="family">Mirkin</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Laurent</namePart>
      <namePart type="family">Besacier</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued encoding="w3cdtf" point="start" keyDate="yes">2014-10-22</dateIssued>
      <dateIssued encoding="w3cdtf" point="end">2014-10-26</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 11th Conference of the Association for Machine Translation in the Americas: MT Researchers Track</title>
      </titleInfo>
      <originInfo>
        <publisher>Association for Machine Translation in the Americas</publisher>
        <place>
          <placeTerm type="text">Vancouver, Canada</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>Data selection is a common technique for adapting statistical translation models for a specific domain, which has been shown to both improve translation quality and to reduce model size. Selection relies on some in-domain data, of the same domain of the texts expected to be translated. Selecting the sentence-pairs that are most similar to the in-domain data from a pool of parallel texts has been shown to be effective; yet, this approach holds the risk of resulting in a limited coverage, when necessary n-grams that do appear in the pool are less similar to in-domain data that is available in advance. Some methods select additional data based on the actual text that needs to be translated. While useful, this is not always a practical scenario. In this work we describe an extensive exploration of data selection techniques over Arabic to French datasets, and propose methods to address both similarity and coverage considerations while maintaining a limited model size.</abstract>
    <identifier type="citekey">mirkin-besacier-2014-data</identifier>
    <location>
      <url>https://aclanthology.org/2014.amta-researchers.23</url>
    </location>
    <part>
      <date encoding="w3cdtf" point="start">2014-10-22</date>
      <date encoding="w3cdtf" point="end">2014-10-26</date>
      <extent unit="page">
        <start>301</start>
        <end>314</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Data selection for compact adapted SMT models
%A Mirkin, Shachar
%A Besacier, Laurent
%S Proceedings of the 11th Conference of the Association for Machine Translation in the Americas: MT Researchers Track
%D 2014
%8 oct" 22 26"
%I Association for Machine Translation in the Americas
%C Vancouver, Canada
%F mirkin-besacier-2014-data
%X Data selection is a common technique for adapting statistical translation models for a specific domain, which has been shown to both improve translation quality and to reduce model size. Selection relies on some in-domain data, of the same domain of the texts expected to be translated. Selecting the sentence-pairs that are most similar to the in-domain data from a pool of parallel texts has been shown to be effective; yet, this approach holds the risk of resulting in a limited coverage, when necessary n-grams that do appear in the pool are less similar to in-domain data that is available in advance. Some methods select additional data based on the actual text that needs to be translated. While useful, this is not always a practical scenario. In this work we describe an extensive exploration of data selection techniques over Arabic to French datasets, and propose methods to address both similarity and coverage considerations while maintaining a limited model size.
%U https://aclanthology.org/2014.amta-researchers.23
%P 301-314
Markdown (Informal)
[Data selection for compact adapted SMT models](https://aclanthology.org/2014.amta-researchers.23) (Mirkin & Besacier, AMTA 2014)
ACL
- Shachar Mirkin and Laurent Besacier. 2014. Data selection for compact adapted SMT models. In Proceedings of the 11th Conference of the Association for Machine Translation in the Americas: MT Researchers Track, pages 301–314, Vancouver, Canada. Association for Machine Translation in the Americas.