@inproceedings{caswell-etal-2020-language,
  title     = {Language {ID} in the Wild: Unexpected Challenges on the Path to a Thousand-Language Web Text Corpus},
  author    = {Caswell, Isaac and
               Breiner, Theresa and
               van Esch, Daan and
               Bapna, Ankur},
  editor    = {Scott, Donia and
               Bel, Nuria and
               Zong, Chengqing},
  booktitle = {Proceedings of the 28th International Conference on Computational Linguistics},
  month     = dec,
  year      = {2020},
  address   = {Barcelona, Spain (Online)},
  publisher = {International Committee on Computational Linguistics},
  url       = {https://aclanthology.org/2020.coling-main.579/},
  doi       = {10.18653/v1/2020.coling-main.579},
  pages     = {6588--6608},
  abstract  = {Large text corpora are increasingly important for a wide variety of Natural Language Processing (NLP) tasks, and automatic language identification (LangID) is a core technology needed to collect such datasets in a multilingual context. LangID is largely treated as solved in the literature, with models reported that achieve over 90{\%} average F1 on as many as 1,366 languages. We train LangID models on up to 1,629 languages with comparable quality on held-out test sets, but find that human-judged LangID accuracy for web-crawl text corpora created using these models is only around 5{\%} for many lower-resource languages, suggesting a need for more robust evaluation. Further analysis revealed a variety of error modes, arising from domain mismatch, class imbalance, language similarity, and insufficiently expressive models. We propose two classes of techniques to mitigate these errors: wordlist-based tunable-precision filters (for which we release curated lists in about 500 languages) and transformer-based semi-supervised LangID models, which increase median dataset precision from 5.5{\%} to 71.2{\%}. These techniques enable us to create an initial data set covering 100K or more relatively clean sentences in each of 500+ languages, paving the way towards a 1,000-language web text corpus.},
}
Markdown (Informal)
[Language ID in the Wild: Unexpected Challenges on the Path to a Thousand-Language Web Text Corpus](https://aclanthology.org/2020.coling-main.579/) (Caswell et al., COLING 2020)
ACL