% Poster paper in the Machine Translation Summit X proceedings (2005).
% The day range of the event is appended to the predefined month macro with
% the # concatenation operator; it uses "--" so it typesets as an en-dash,
% the same way the page range does.
% NOTE(review): the url field points at a preview.aclanthology.org build
% ("Ingest-2025-COMPUTEL"); presumably a staging copy -- confirm the permanent
% address before relying on it.
@inproceedings{zavarsky-etal-2005-language,
  title     = "Language and Encoding Scheme Identification of Extremely Large Sets of Multilingual Text",
  author    = "Zavarsky, Pavol and
               Mikami, Yoshiki and
               Wada, Shota",
  booktitle = "Proceedings of Machine Translation Summit X: Posters",
  month     = sep # " 13--15",
  year      = "2005",
  address   = "Phuket, Thailand",
  url       = "https://preview.aclanthology.org/Ingest-2025-COMPUTEL/2005.mtsummit-posters.5/",
  pages     = "354--355",
  abstract  = "In the paper we present an outline of our approach to identify languages and encoding schemes in extremely large sets of multi-lingual documents. The large sets we are analyzing in our Language Observatory project [1] are formed by dozens of millions of text documents. In the paper we present an approach which allows us to analyze about 250 documents every second (about 20 million documents/day) on a single Linux machine. Using a multithread processing on a cluster of Linux servers we are able to analyze easily more than 100 million documents/day."
}
Markdown (Informal)
[Language and Encoding Scheme Identification of Extremely Large Sets of Multilingual Text](https://preview.aclanthology.org/Ingest-2025-COMPUTEL/2005.mtsummit-posters.5/) (Zavarsky et al., MTSummit 2005)
ACL