@inproceedings{arnett-etal-2024-bit,
  author    = {Arnett, Catherine and Chang, Tyler A. and Bergen, Benjamin},
  title     = {A Bit of a Problem: Measurement Disparities in Dataset Sizes across Languages},
  editor    = {Melero, Maite and Sakti, Sakriani and Soria, Claudia},
  booktitle = {Proceedings of the 3rd Annual Meeting of the Special Interest Group on Under-resourced Languages @ LREC-COLING 2024},
  month     = may,
  year      = {2024},
  address   = {Torino, Italia},
  publisher = {ELRA and ICCL},
  pages     = {1--9},
  url       = {https://aclanthology.org/2024.sigul-1.1},
  abstract  = {How should text dataset sizes be compared across languages? Even for content-matched (parallel) corpora, UTF-8 encoded text can require a dramatically different number of bytes for different languages. In our work, we define the byte premium between two languages as the ratio of bytes used to encode content-matched text in those languages. We compute byte premiums for 1155 languages, and we use linear regressions to estimate byte premiums for other languages. We release a tool to obtain byte premiums for any two languages, enabling comparisons of dataset sizes across languages for more equitable multilingual model development and data practices.},
}
Markdown (Informal)
[A Bit of a Problem: Measurement Disparities in Dataset Sizes across Languages](https://aclanthology.org/2024.sigul-1.1) (Arnett et al., SIGUL-WS 2024)
ACL