@inproceedings{poh-etal-2025-say,
title = "What did you say? Generating Child-Directed Speech Questions to Train {LLM}s",
author = "Poh, Whitney and
Tombolini, Michael and
Barak, Libby",
editor = "Charpentier, Lucas and
Choshen, Leshem and
Cotterell, Ryan and
Gul, Mustafa Omer and
Hu, Michael Y. and
Liu, Jing and
Jumelet, Jaap and
Linzen, Tal and
Mueller, Aaron and
Ross, Candace and
Shah, Raj Sanjay and
Warstadt, Alex and
Wilcox, Ethan Gotlieb and
Williams, Adina",
booktitle = "Proceedings of the First BabyLM Workshop",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/ingest-emnlp/2025.babylm-main.18/",
pages = "237--245",
ISBN = "TODO",
abstract = "Child-Directed Speech (CDS) holds unique linguistic properties that distinguish it from other types of textual corpora. Language models trained using CDS often obtain superior results compared with the same size of different types of data. Several studies have aimed at modifying non-CDS data to mimic its linguistic properties to match the hypothesized advantageous aspects of CDS. Here, we propose to adapt the non-CDS portions of the training data to include questions similar to CDS interaction. We modify the data by adding artificially generated questions to the data and methodically analyzing the change in performance using each modified dataset. Our results show that artificial question generation strongly depends on the properties of the original dataset. While the performance improves for question-related measures, the overall performance is negatively affected as a result of the reduced syntactic diversity."
}Markdown (Informal)
[What did you say? Generating Child-Directed Speech Questions to Train LLMs](https://preview.aclanthology.org/ingest-emnlp/2025.babylm-main.18/) (Poh et al., BabyLM 2025)
ACL