@inproceedings{michael-horbach-2025-germdetect,
  title     = {{GermDetect}: Verb Placement Error Detection Datasets for Learners of {Germanic} Languages},
  author    = {Michael, Noah-Manuel and
               Horbach, Andrea},
  editor    = {Kochmar, Ekaterina and
               Alhafni, Bashar and
               Bexte, Marie and
               Burstein, Jill and
               Horbach, Andrea and
               Laarmann-Quante, Ronja and
               Tack, Ana{\"i}s and
               Yaneva, Victoria and
               Yuan, Zheng},
  booktitle = {Proceedings of the 20th Workshop on Innovative Use of {NLP} for Building Educational Applications ({BEA} 2025)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.bea-1.59/},
  pages     = {818--829},
  isbn      = {979-8-89176-270-1},
  abstract  = {Correct verb placement is difficult to acquire for second-language learners of Germanic languages. However, word order errors and, consequently, verb placement errors, are heavily underrepresented in benchmark datasets of NLP tasks such as grammatical error detection/correction and linguistic acceptability assessment. If they are present, they are most often naively introduced, or classification occurs at the sentence level, preventing the precise identification of individual errors and the provision of appropriate feedback to learners. To remedy this, we present GermDetect: Universal Dependencies-based, linguistically informed verb placement error detection datasets for learners of Germanic languages, designed as a token classification task. As our datasets are UD-based, we are able to provide them in most major Germanic languages: Afrikaans, German, Dutch, Faroese, Icelandic, Danish, Norwegian (Bokm{\aa}l and Nynorsk), and Swedish. We train multilingual BERT models on GermDetect and show that linguistically informed, UD-based error induction results in more effective models for verb placement error detection than models trained on naively introduced errors. Finally, we conduct ablation studies on multilingual training and find that lower-resource languages benefit from the inclusion of structurally related languages in training.},
}