@inproceedings{bexte-zesch-2025-lunch,
title = "Is Lunch Free Yet? Overcoming the Cold-Start Problem in Supervised Content Scoring using Zero-Shot {LLM}-Generated Training Data",
author = "Bexte, Marie and
Zesch, Torsten",
editor = {Kochmar, Ekaterina and
Alhafni, Bashar and
Bexte, Marie and
Burstein, Jill and
Horbach, Andrea and
Laarmann-Quante, Ronja and
Tack, Ana{\"i}s and
Yaneva, Victoria and
Yuan, Zheng},
booktitle = "Proceedings of the 20th Workshop on Innovative Use of NLP for Building Educational Applications (BEA 2025)",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/landing_page/2025.bea-1.11/",
pages = "144--159",
ISBN = "979-8-89176-270-1",
abstract = "In this work, we assess the potential of using synthetic data to train models for content scoring. We generate a parallel corpus of LLM-generated data for the SRA dataset. In our experiments, we train three different kinds of models (Logistic Regression, BERT, SBERT) with this data, examining their respective ability to bridge between generated training data and student-authored test data. We also explore the effects of generating larger volumes of training data than what is available in the original dataset. Overall, we find that training models from LLM-generated data outperforms zero-shot scoring of the test data with an LLM. Still, the fine-tuned models perform much worse than models trained on the original data, largely because the LLM-generated answers often do not to conform to the desired labels. However, once the data is manually relabeled, competitive models can be trained from it. With a similarity-based scoring approach, the relabeled (larger) amount of synthetic answers consistently yields a model that surpasses performance of training on the (limited) amount of answers available in the original dataset."
}