% Shared-task system paper in the BEA 2026 workshop proceedings
% (ACL Anthology ID 2026.bea-1.82).
% - Title braces protect whole words (acronyms, proper nouns) from style recasing.
% - url is the canonical Anthology address, not the temporary preview/ingest branch.
@inproceedings{pineda-etal-2026-data,
  title     = {Data {Asgardians} at {BEA} 2026 Shared Task 1: A Hybrid Transformer{--}Feature Ensemble for {L1}-Aware {English} Vocabulary Difficulty Prediction},
  author    = {Pineda, Adrian and
               Butt, Sabur and
               Ceballos Cancino, H{\'e}ctor},
  editor    = {Kochmar, Ekaterina and
               Alhafni, Bashar and
               Bann{\`o}, Stefano and
               Bexte, Marie and
               Burstein, Jill and
               Horbach, Andrea and
               Laarmann-Quante, Ronja and
               Tack, Ana{\"i}s and
               Yaneva, Victoria and
               Yuan, Zheng},
  booktitle = {Proceedings of the 21st Workshop on Innovative Use of {NLP} for Building Educational Applications ({BEA} 2026)},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.bea-1.82/},
  pages     = {1137--1145},
  isbn      = {979-8-89176-409-5},
  abstract  = {This paper presents our system for the BEA 2026 Shared Task on Vocabulary Difficulty Prediction for English Learners. The task requires predicting psychometrically calibrated GLMM difficulty scores for English vocabulary items across three learner first-language (L1) backgrounds: Spanish (ES), German (DE), and Mandarin Chinese (CN). Our approach studies how hand-crafted linguistic features can complement contextual multilingual transformer representations. We engineer 33 phonological, morphological, semantic, contextual, and cross-lingual features, and evaluate feature-only regressors, Solo transformer models, Hybrid transformer models, and prediction-level ensembling. Our official Closed Track submissions were generated with XLM-RoBERTa-large Solo and Hybrid models, which improved over the official baseline for all three L1 groups, achieving test RMSEs of 1.182 (ES), 1.117 (DE), and 1.006 (CN) with a mean of 1.103. We then conducted a post-submission refinement using mDeBERTa-v3-base components and a Ridge stacking ensemble, which further reduced test RMSE to 1.037 (ES), 0.997 (DE), and 0.913 (CN), with a mean of 0.982, a mean improvement of 0.121 over our best XLM-RoBERTa-large system.},
}