% LoResMT 2025 workshop paper presenting MLCommons Datasets Working Group projects
% (ACL Anthology ID 2025.loresmt-1.14).
% The url field holds the canonical aclanthology.org address for that ID rather than
% a preview.aclanthology.org branch build, which is presumably not a stable target.
% The acronym in booktitle is brace-protected so case-changing styles keep it intact.
@inproceedings{luger-etal-2025-building,
  title     = {Building Data Infrastructure for Low-Resource Languages},
  author    = {Luger, Sarah K. K. and
               Mosquera, Rafael and
               Ortiz Suarez, Pedro},
  editor    = {Ojha, Atul Kr. and
               Liu, Chao-hong and
               Vylomova, Ekaterina and
               Pirinen, Flammie and
               Washington, Jonathan and
               Oco, Nathaniel and
               Zhao, Xiaobing},
  booktitle = {Proceedings of the Eighth Workshop on Technologies for Machine Translation of Low-Resource Languages ({LoResMT} 2025)},
  month     = may,
  year      = {2025},
  address   = {Albuquerque, New Mexico, U.S.A.},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.loresmt-1.14/},
  pages     = {154--160},
  isbn      = {979-8-89176-230-5},
  abstract  = {The MLCommons Datasets Working Group presents a comprehensive initiative to advance the development and accessibility of artificial intelligence (AI) training and testing resources. This paper introduces three key projects aimed at addressing critical gaps in the AI data ecosystem: the Unsupervised People{'}s Speech Dataset, containing over 821,000 hours of speech across 89+ languages; a strategic collaboration with Common Crawl to enhance web crawling capabilities for low-resource languages; and a framework for knowledge graph extraction evaluation. By focusing on languages other than English (LOTE) and creating permissively licensed, high-quality datasets, these initiatives aim to democratize AI development and improve model performance across diverse linguistic contexts. This work represents a significant step toward more inclusive and capable AI systems that can serve global communities.},
}
Markdown (Informal)
[Building Data Infrastructure for Low-Resource Languages](https://aclanthology.org/2025.loresmt-1.14/) (Luger et al., LoResMT 2025)
ACL
- Sarah K. K. Luger, Rafael Mosquera, and Pedro Ortiz Suarez. 2025. Building Data Infrastructure for Low-Resource Languages. In Proceedings of the Eighth Workshop on Technologies for Machine Translation of Low-Resource Languages (LoResMT 2025), pages 154–160, Albuquerque, New Mexico, U.S.A. Association for Computational Linguistics.