% Conference paper, LREC 2014. The k of k-medoids in the abstract is set in math mode.
@inproceedings{ke-marteau-2014-co,
  author    = {Ke, Guiyao and Marteau, Pierre-Francois},
  title     = {Co-clustering of bilingual datasets as a mean for assisting the construction of thematic bilingual comparable corpora},
  booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}'14)},
  month     = may,
  year      = {2014},
  address   = {Reykjavik, Iceland},
  publisher = {European Language Resources Association (ELRA)},
  pages     = {1992--1999},
  url       = {http://www.lrec-conf.org/proceedings/lrec2014/pdf/88_Paper.pdf},
  abstract  = {We address in this paper the assisted construction of bilingual thematic comparable corpora by means of co-clustering bilingual documents collected from raw sources such as the Web. The proposed approach is based on a quantitative comparability measure and a co-clustering approach which allow to mix similarity measures existing in each of the two linguistic spaces with a ``thematic'' comparability measure that defines a mapping between these two spaces. With the improvement of the co-clustering ($k$-medoids) performance we get, we use a comparability threshold and a manual verification to ensure the good and robust alignment of co-clusters (co-medoids). Finally, from any available raw corpus, we enrich the aligned clusters in order to provide ``thematic'' comparable corpora of good quality and controlled size. On a case study that exploit raw web data, we show that this approach scales reasonably well and is quite suited for the construction of thematic comparable corpora of good quality.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS record for a conference paper hosted in the LREC'14 proceedings.
     Dates are given in W3CDTF form (YYYY-MM) so they can be parsed mechanically. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="ke-marteau-2014-co">
    <titleInfo>
      <title>Co-clustering of bilingual datasets as a mean for assisting the construction of thematic bilingual comparable corpora</title>
    </titleInfo>
    <!-- Authors, in publication order. -->
    <name type="personal">
      <namePart type="given">Guiyao</namePart>
      <namePart type="family">Ke</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Pierre-Francois</namePart>
      <namePart type="family">Marteau</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued encoding="w3cdtf">2014-05</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume this paper appears in. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC’14)</title>
      </titleInfo>
      <originInfo>
        <publisher>European Language Resources Association (ELRA)</publisher>
        <place>
          <placeTerm type="text">Reykjavik, Iceland</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We address in this paper the assisted construction of bilingual thematic comparable corpora by means of co-clustering bilingual documents collected from raw sources such as the Web. The proposed approach is based on a quantitative comparability measure and a co-clustering approach which allow to mix similarity measures existing in each of the two linguistic spaces with a “thematic” comparability measure that defines a mapping between these two spaces. With the improvement of the co-clustering ($k$-medoids) performance we get, we use a comparability threshold and a manual verification to ensure the good and robust alignment of co-clusters (co-medoids). Finally, from any available raw corpus, we enrich the aligned clusters in order to provide “thematic” comparable corpora of good quality and controlled size. On a case study that exploit raw web data, we show that this approach scales reasonably well and is quite suited for the construction of thematic comparable corpora of good quality.</abstract>
    <identifier type="citekey">ke-marteau-2014-co</identifier>
    <location>
      <url>http://www.lrec-conf.org/proceedings/lrec2014/pdf/88_Paper.pdf</url>
    </location>
    <!-- Position of the paper within the host volume. -->
    <part>
      <date encoding="w3cdtf">2014-05</date>
      <extent unit="page">
        <start>1992</start>
        <end>1999</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Co-clustering of bilingual datasets as a mean for assisting the construction of thematic bilingual comparable corpora
%A Ke, Guiyao
%A Marteau, Pierre-Francois
%S Proceedings of the Ninth International Conference on Language Resources and Evaluation (LREC’14)
%D 2014
%8 may
%I European Language Resources Association (ELRA)
%C Reykjavik, Iceland
%F ke-marteau-2014-co
%X We address in this paper the assisted construction of bilingual thematic comparable corpora by means of co-clustering bilingual documents collected from raw sources such as the Web. The proposed approach is based on a quantitative comparability measure and a co-clustering approach which allow to mix similarity measures existing in each of the two linguistic spaces with a “thematic” comparability measure that defines a mapping between these two spaces. With the improvement of the co-clustering ($k$-medoids) performance we get, we use a comparability threshold and a manual verification to ensure the good and robust alignment of co-clusters (co-medoids). Finally, from any available raw corpus, we enrich the aligned clusters in order to provide “thematic” comparable corpora of good quality and controlled size. On a case study that exploit raw web data, we show that this approach scales reasonably well and is quite suited for the construction of thematic comparable corpora of good quality.
%U http://www.lrec-conf.org/proceedings/lrec2014/pdf/88_Paper.pdf
%P 1992-1999
Markdown (Informal)
[Co-clustering of bilingual datasets as a mean for assisting the construction of thematic bilingual comparable corpora](http://www.lrec-conf.org/proceedings/lrec2014/pdf/88_Paper.pdf) (Ke & Marteau, LREC 2014)
ACL