@inproceedings{kilgarriff-etal-2010-corpus,
title = "A Corpus Factory for Many Languages",
author = "Kilgarriff, Adam and
Reddy, Siva and
Pomik{\'a}lek, Jan and
PVS, Avinesh",
editor = "Calzolari, Nicoletta and
Choukri, Khalid and
Maegaard, Bente and
Mariani, Joseph and
Odijk, Jan and
Piperidis, Stelios and
Rosner, Mike and
Tapias, Daniel",
booktitle = "Proceedings of the Seventh International Conference on Language Resources and Evaluation ({LREC}`10)",
month = may,
year = "2010",
address = "Valletta, Malta",
publisher = "European Language Resources Association (ELRA)",
url = "https://preview.aclanthology.org/add-emnlp-2024-awards/L10-1044/",
abstract = "For many languages there are no large, general-language corpora available. Until the web, all but the institutions could do little but shake their heads in dismay as corpus-building was long, slow and expensive. But with the advent of the Web it can be highly automated and thereby fast and inexpensive. We have developed a corpus factory where we build large corpora. In this paper we describe the method we use, and how it has worked, and how various problems were solved, for eight languages: Dutch, Hindi, Indonesian, Norwegian, Swedish, Telugu, Thai and Vietnamese. We use the BootCaT method: we take a set of `seed words' for the language from Wikipedia. Then, several hundred times over, we * randomly select three or four of the seed words * send as a query to Google or Yahoo or Bing, which returns a `search hits' page * gather the pages that Google or Yahoo point to and save the text. This forms the corpus, which we then * `clean' (to remove navigation bars, advertisements etc) * remove duplicates * tokenise and (if tools are available) lemmatise and part-of-speech tag * load into our corpus query tool, the Sketch Engine The corpora we have developed are available for use in the Sketch Engine corpus query tool."
}
Markdown (Informal)
[A Corpus Factory for Many Languages](https://preview.aclanthology.org/add-emnlp-2024-awards/L10-1044/) (Kilgarriff et al., LREC 2010)
ACL
- Adam Kilgarriff, Siva Reddy, Jan Pomikálek, and Avinesh PVS. 2010. A Corpus Factory for Many Languages. In Proceedings of the Seventh International Conference on Language Resources and Evaluation (LREC'10), Valletta, Malta. European Language Resources Association (ELRA).