@inproceedings{aker-etal-2012-light,
title = "A light way to collect comparable corpora from the Web",
author = "Aker, Ahmet and
Kanoulas, Evangelos and
Gaizauskas, Robert",
booktitle = "Proceedings of the Eighth International Conference on Language Resources and Evaluation ({LREC}'12)",
month = may,
year = "2012",
address = "Istanbul, Turkey",
publisher = "European Language Resources Association (ELRA)",
url = "http://www.lrec-conf.org/proceedings/lrec2012/pdf/626_Paper.pdf",
pages = "15--20",
abstract = "Statistical Machine Translation (SMT) relies on the availability of rich parallel corpora. However, in the case of under-resourced languages, parallel corpora are not readily available. To overcome this problem previous work has recognized the potential of using comparable corpora as training data. The process of obtaining such data usually involves (1) downloading a separate list of documents for each language, (2) matching the documents between two languages usually by comparing the document contents, and finally (3) extracting useful data for SMT from the matched document pairs. This process requires a large amount of time and resources since a huge volume of documents needs to be downloaded to increase the chances of finding good document pairs. In this work we aim to reduce the amount of time and resources spent for tasks 1 and 2. Instead of obtaining full documents we first obtain just titles along with some meta-data such as time and date of publication. Titles can be obtained through Web Search and RSS News feed collections so that download of the full documents is not needed. We show experimentally that titles can be used to approximate the comparison between documents using full document contents.",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="aker-etal-2012-light">
<titleInfo>
<title>A light way to collect comparable corpora from the Web</title>
</titleInfo>
<name type="personal">
<namePart type="given">Ahmet</namePart>
<namePart type="family">Aker</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Evangelos</namePart>
<namePart type="family">Kanoulas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Robert</namePart>
<namePart type="family">Gaizauskas</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2012-may</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC’12)</title>
</titleInfo>
<originInfo>
<publisher>European Language Resources Association (ELRA)</publisher>
<place>
<placeTerm type="text">Istanbul, Turkey</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Statistical Machine Translation (SMT) relies on the availability of rich parallel corpora. However, in the case of under-resourced languages, parallel corpora are not readily available. To overcome this problem previous work has recognized the potential of using comparable corpora as training data. The process of obtaining such data usually involves (1) downloading a separate list of documents for each language, (2) matching the documents between two languages usually by comparing the document contents, and finally (3) extracting useful data for SMT from the matched document pairs. This process requires a large amount of time and resources since a huge volume of documents needs to be downloaded to increase the chances of finding good document pairs. In this work we aim to reduce the amount of time and resources spent for tasks 1 and 2. Instead of obtaining full documents we first obtain just titles along with some meta-data such as time and date of publication. Titles can be obtained through Web Search and RSS News feed collections so that download of the full documents is not needed. We show experimentally that titles can be used to approximate the comparison between documents using full document contents.</abstract>
<identifier type="citekey">aker-etal-2012-light</identifier>
<location>
<url>http://www.lrec-conf.org/proceedings/lrec2012/pdf/626_Paper.pdf</url>
</location>
<part>
<date>2012-may</date>
<extent unit="page">
<start>15</start>
<end>20</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T A light way to collect comparable corpora from the Web
%A Aker, Ahmet
%A Kanoulas, Evangelos
%A Gaizauskas, Robert
%S Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC’12)
%D 2012
%8 may
%I European Language Resources Association (ELRA)
%C Istanbul, Turkey
%F aker-etal-2012-light
%X Statistical Machine Translation (SMT) relies on the availability of rich parallel corpora. However, in the case of under-resourced languages, parallel corpora are not readily available. To overcome this problem previous work has recognized the potential of using comparable corpora as training data. The process of obtaining such data usually involves (1) downloading a separate list of documents for each language, (2) matching the documents between two languages usually by comparing the document contents, and finally (3) extracting useful data for SMT from the matched document pairs. This process requires a large amount of time and resources since a huge volume of documents needs to be downloaded to increase the chances of finding good document pairs. In this work we aim to reduce the amount of time and resources spent for tasks 1 and 2. Instead of obtaining full documents we first obtain just titles along with some meta-data such as time and date of publication. Titles can be obtained through Web Search and RSS News feed collections so that download of the full documents is not needed. We show experimentally that titles can be used to approximate the comparison between documents using full document contents.
%U http://www.lrec-conf.org/proceedings/lrec2012/pdf/626_Paper.pdf
%P 15-20
Markdown (Informal)
[A light way to collect comparable corpora from the Web](http://www.lrec-conf.org/proceedings/lrec2012/pdf/626_Paper.pdf) (Aker et al., LREC 2012)
ACL
- Ahmet Aker, Evangelos Kanoulas, and Robert Gaizauskas. 2012. A light way to collect comparable corpora from the Web. In Proceedings of the Eighth International Conference on Language Resources and Evaluation (LREC'12), pages 15–20, Istanbul, Turkey. European Language Resources Association (ELRA).