% Ke & Marteau, LREC 2014: co-clustering of bilingual documents to assist
% the construction of thematic bilingual comparable corpora.
@inproceedings{ke-marteau-2014-co,
  author    = {Ke, Guiyao and
               Marteau, Pierre-Francois},
  title     = {Co-clustering of bilingual datasets as a mean for assisting the construction of thematic bilingual comparable corpora},
  editor    = {Calzolari, Nicoletta and
               Choukri, Khalid and
               Declerck, Thierry and
               Loftsson, Hrafn and
               Maegaard, Bente and
               Mariani, Joseph and
               Moreno, Asuncion and
               Odijk, Jan and
               Piperidis, Stelios},
  booktitle = {Proceedings of the Ninth International Conference on Language Resources and Evaluation ({LREC}'14)},
  month     = may,
  year      = {2014},
  address   = {Reykjavik, Iceland},
  publisher = {European Language Resources Association (ELRA)},
  pages     = {1992--1999},
  url       = {https://aclanthology.org/L14-1677/},
  abstract  = {We address in this paper the assisted construction of bilingual thematic comparable corpora by means of co-clustering bilingual documents collected from raw sources such as the Web. The proposed approach is based on a quantitative comparability measure and a co-clustering approach which allow to mix similarity measures existing in each of the two linguistic spaces with a {\textquotedblleft}thematic{\textquotedblright} comparability measure that defines a mapping between these two spaces. With the improvement of the co-clustering ($k$-medoids) performance we get, we use a comparability threshold and a manual verification to ensure the good and robust alignment of co-clusters (co-medoids). Finally, from any available raw corpus, we enrich the aligned clusters in order to provide {\textquotedblleft}thematic{\textquotedblright} comparable corpora of good quality and controlled size. On a case study that exploit raw web data, we show that this approach scales reasonably well and is quite suited for the construction of thematic comparable corpora of good quality.},
}
Markdown (Informal)
[Co-clustering of bilingual datasets as a mean for assisting the construction of thematic bilingual comparable corpora](https://aclanthology.org/L14-1677/) (Ke & Marteau, LREC 2014)
ACL