@inproceedings{ion-2012-pexacc,
title = "{PEXACC}: A Parallel Sentence Mining Algorithm from Comparable Corpora",
author = "Ion, Radu",
booktitle = "Proceedings of the Eighth International Conference on Language Resources and Evaluation ({LREC}'12)",
month = may,
year = "2012",
address = "Istanbul, Turkey",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2012/pdf/382_Paper.pdf",
pages = "2181--2188",
abstract = "Extracting parallel data from comparable corpora in order to enrich existing statistical translation models is an avenue that attracted a lot of research in recent years. There are experiments that convincingly show how parallel data extracted from comparable corpora is able to improve statistical machine translation. Yet, the existing body of research on parallel sentence mining from comparable corpora does not take into account the degree of comparability of the corpus being processed or the computation time it takes to extract parallel sentences from a corpus of a given size. We will show that the performance of a parallel sentence extractor crucially depends on the degree of comparability such that it is more difficult to process a weakly comparable corpus than a strongly comparable corpus. In this paper we describe PEXACC, a distributed (running on multiple CPUs), trainable parallel sentence/phrase extractor from comparable corpora. PEXACC is freely available for download with the ACCURAT Toolkit, a collection of MT-related tools developed in the ACCURAT project.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="ion-2012-pexacc">
<titleInfo>
<title>PEXACC: A Parallel Sentence Mining Algorithm from Comparable Corpora</title>
</titleInfo>
<name type="personal">
<namePart type="given">Radu</namePart>
<namePart type="family">Ion</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2012-may</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC’12)</title>
</titleInfo>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Istanbul, Turkey</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Extracting parallel data from comparable corpora in order to enrich existing statistical translation models is an avenue that attracted a lot of research in recent years. There are experiments that convincingly show how parallel data extracted from comparable corpora is able to improve statistical machine translation. Yet, the existing body of research on parallel sentence mining from comparable corpora does not take into account the degree of comparability of the corpus being processed or the computation time it takes to extract parallel sentences from a corpus of a given size. We will show that the performance of a parallel sentence extractor crucially depends on the degree of comparability such that it is more difficult to process a weakly comparable corpus than a strongly comparable corpus. In this paper we describe PEXACC, a distributed (running on multiple CPUs), trainable parallel sentence/phrase extractor from comparable corpora. PEXACC is freely available for download with the ACCURAT Toolkit, a collection of MT-related tools developed in the ACCURAT project.</abstract>
<identifier type="citekey">ion-2012-pexacc</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2012/pdf/382_Paper.pdf</url>
</location>
<part>
<date>2012-may</date>
<extent unit="page">
<start>2181</start>
<end>2188</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T PEXACC: A Parallel Sentence Mining Algorithm from Comparable Corpora
%A Ion, Radu
%S Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC’12)
%D 2012
%8 may
%I European Language Resources Association (ELRA)
%C Istanbul, Turkey
%F ion-2012-pexacc
%X Extracting parallel data from comparable corpora in order to enrich existing statistical translation models is an avenue that attracted a lot of research in recent years. There are experiments that convincingly show how parallel data extracted from comparable corpora is able to improve statistical machine translation. Yet, the existing body of research on parallel sentence mining from comparable corpora does not take into account the degree of comparability of the corpus being processed or the computation time it takes to extract parallel sentences from a corpus of a given size. We will show that the performance of a parallel sentence extractor crucially depends on the degree of comparability such that it is more difficult to process a weakly comparable corpus than a strongly comparable corpus. In this paper we describe PEXACC, a distributed (running on multiple CPUs), trainable parallel sentence/phrase extractor from comparable corpora. PEXACC is freely available for download with the ACCURAT Toolkit, a collection of MT-related tools developed in the ACCURAT project.
%U http://www.lrec-conf.org/proceedings/lrec2012/pdf/382_Paper.pdf
%P 2181-2188
Markdown (Informal)
[PEXACC: A Parallel Sentence Mining Algorithm from Comparable Corpora](http://www.lrec-conf.org/proceedings/lrec2012/pdf/382_Paper.pdf) (Ion, LREC 2012)
ACL