@inproceedings{eltanbouly-etal-2026-one,
    title = "Is One Dataset Enough for Evaluation? Studying Generalizability of Automated Essay Scoring Models",
    author = "Eltanbouly, Sohaila and
      Sayed, Marwan and
      Elsayed, Tamer",
    editor = "Piperidis, Stelios and
      Bel, N{\'u}ria and
      van den Heuvel, Henk and
      Ide, Nancy and
      Krek, Simon and
      Toral, Antonio",
    booktitle = "International Conference on Language Resources and Evaluation",
    month = may,
    year = "2026",
    address = "Palma de Mallorca, Spain",
    publisher = "ELRA Language Resource Association",
    url = "https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.29/",
    pages = "431--440",
    abstract = "Automated Essay Scoring (AES) has made significant advancements in writing assessment. Recently, cross-prompt AES has gained attention because of its focus on generalizing to unseen prompts. Despite the promise of these advancements, a critical question remains: how generalizable and robust are those models when applied to diverse datasets? This study assesses the generalizability of eight cross-prompt AES models across three different datasets. We employ two experimental setups: the within-dataset approach, where both training and testing occur on the same dataset, and the cross-dataset approach, which challenges the models by evaluating their performance on previously unseen datasets. The experimental results show significant performance inconsistencies, highlighting that relying on a single dataset is insufficient for building robust and generalizable AES systems."
}

Markdown (Informal)
[Is One Dataset Enough for Evaluation? Studying Generalizability of Automated Essay Scoring Models](https://preview.aclanthology.org/ingest-lrec/2026.lrec-main.29/) (Eltanbouly et al., LREC 2026)
ACL