@inproceedings{thierauf-etal-2024-automating,
title = "Automating Dataset Production Using Generative Text and Image Models",
author = "Thierauf, Christopher and
Abrams, Mitchell and
Scheutz, Matthias",
editor = "Calzolari, Nicoletta and
Kan, Min-Yen and
Hoste, Veronique and
Lenci, Alessandro and
Sakti, Sakriani and
Xue, Nianwen",
booktitle = "Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)",
month = may,
year = "2024",
address = "Torino, Italia",
publisher = "ELRA and ICCL",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2024.lrec-main.179/",
pages = "1988--1995",
abstract = "Practical and ethical dataset collection remains a challenge blocking many empirical methods in natural language processing, resulting in a lack of benchmarks or data on which to test hypotheses. We propose a solution to some of these areas by presenting a pipeline to reduce the research burden of producing image and text datasets when datasets may not exist. Our approach, with accompanying software tools, involves (1) generating text with LLMs; (2) creating accompanying image vignettes with text{--}to{--}image transformers; and (3) low-cost human validation. Based on existing literature that has struggled with quantitative evaluation (due to difficulty of data collection), we present the creation of 3 relevant datasets, and conduct a user study that demonstrates this approach is able to aid researchers in obtaining previously-challenging datasets. We provide sample data generated with this technique, the source code used to produce it, and discuss applicability and limitations."
}
Markdown (Informal)
[Automating Dataset Production Using Generative Text and Image Models](https://preview.aclanthology.org/jlcl-multiple-ingestion/2024.lrec-main.179/) (Thierauf et al., LREC-COLING 2024)
ACL