% ACL 2025 long paper. The author's surname particle "van der" is entered in
% lowercase "von Last, First" form so BibTeX parses it as the von part of the name.
@inproceedings{goot-2025-identifying,
  title     = {Identifying Open Challenges in Language Identification},
  author    = {van der Goot, Rob},
  editor    = {Che, Wanxiang and
               Nabende, Joyce and
               Shutova, Ekaterina and
               Pilehvar, Mohammad Taher},
  booktitle = {Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.acl-long.891/},
  pages     = {18207--18227},
  isbn      = {979-8-89176-251-0},
  abstract  = {Automatic language identification is a core problem of many Natural Language Processing (NLP) pipelines. A wide variety of architectures and benchmarks have been proposed with often near-perfect performance. Although previous studies have focused on certain challenging setups (i.e. cross-domain, short inputs), a systematic comparison is missing. We propose a benchmark that allows us to test for the effect of input size, training data size, domain, number of languages, scripts, and language families on performance. We evaluate five popular models on this benchmark and identify which open challenges remain for this task as well as which architectures achieve robust performance. We find that cross-domain setups are the most challenging (although arguably most relevant), and that number of languages, variety in scripts, and variety in language families have only a small impact on performance. We also contribute practical takeaways: training with 1,000 instances per language and a maximum input length of 100 characters is enough for robust language identification. Based on our findings, we train an accurate (94.41{\%}) multi-domain language identification model on 2,034 languages, for which we also provide an analysis of the remaining errors.},
}
Markdown (Informal)
[Identifying Open Challenges in Language Identification](https://aclanthology.org/2025.acl-long.891/) (van der Goot, ACL 2025)
ACL
- Rob van der Goot. 2025. Identifying Open Challenges in Language Identification. In Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), pages 18207–18227, Vienna, Austria. Association for Computational Linguistics.