% Conference-paper record for the WSE Research system description submitted to
% BEA 2026 Shared Task 2 (rubric-based short answer scoring for German).
%
% NOTE(review): `url` points at the ACL Anthology preview/ingest host; presumably
%   it should be swapped for the canonical Anthology URL (and a `doi` added) once
%   the volume is published -- confirm before citing.
% NOTE(review): editor "Tack, Anais" may be missing a diaeresis (Ana{\"i}s) --
%   verify against the proceedings front matter.
@inproceedings{gwozdz-both-2026-wse,
  title     = {{WSE} Research at {BEA} 2026 Shared Task 2: Multi-Strategy Rubric-Based Short Answer Scoring for {German}},
  author    = {Gwozdz, Jonas and
               Both, Andreas},
  editor    = {Kochmar, Ekaterina and
               Alhafni, Bashar and
               Bann{\`o}, Stefano and
               Bexte, Marie and
               Burstein, Jill and
               Horbach, Andrea and
               Laarmann-Quante, Ronja and
               Tack, Anais and
               Yaneva, Victoria and
               Yuan, Zheng},
  booktitle = {Proceedings of the 21st Workshop on Innovative Use of {NLP} for Building Educational Applications ({BEA} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.bea-1.88/},
  pages     = {1210--1216},
  isbn      = {979-8-89176-409-5},
  abstract  = {We describe the WSE Research system for the BEA 2026 Shared Task 2 on Rubric-based Short Answer Scoring for German. Our system combines rubric-conditioned prompting with TF-IDF exemplar retrieval, LoRA fine-tuning of open-source Qwen models, and prediction aggregation across complementary scorers. The central question is when prompt engineering, parameter-efficient adaptation, and aggregation each help for rubric-based grading. On the ALICE-LP-1.0 trial set, a fine-tuned Qwen2.5-32B reaches QWK 0.769, surpassing the strongest prompted commercial baseline (Gemini 3 Flash, 0.748). On the official test set, the system ranks second on three tracks and third on the remaining one. Overall, the results show that rubric-conditioned fine-tuning is a competitive and cost-effective alternative to commercial APIs for German short answer scoring, while aggregation helps on seen questions but larger single models generalize better to unseen rubrics.},
}
Markdown (Informal)
[WSE Research at BEA 2026 Shared Task 2: Multi-Strategy Rubric-Based Short Answer Scoring for German](https://preview.aclanthology.org/ingest-acl-workshops/2026.bea-1.88/) (Gwozdz & Both, BEA 2026)
ACL