% EMNLP 2024 main-conference paper; ACL Anthology ID 2024.emnlp-main.596.
% The url field uses the permanent Anthology landing page rather than a
% preview/staging host, so the link stays stable over time.
@inproceedings{parmar-etal-2024-data,
  title     = {Data, Data Everywhere: A Guide for Pretraining Dataset Construction},
  author    = {Parmar, Jupinder and
               Prabhumoye, Shrimai and
               Jennings, Joseph and
               Liu, Bo and
               Jhunjhunwala, Aastha and
               Wang, Zhilin and
               Patwary, Mostofa and
               Shoeybi, Mohammad and
               Catanzaro, Bryan},
  editor    = {Al-Onaizan, Yaser and
               Bansal, Mohit and
               Chen, Yun-Nung},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  month     = nov,
  year      = {2024},
  address   = {Miami, Florida, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.emnlp-main.596/},
  doi       = {10.18653/v1/2024.emnlp-main.596},
  pages     = {10671--10695},
}