@inproceedings{saleva-palen-michel-2024-brandeis,
title = "{B}randeis at {V}ar{D}ial 2024 {DSL}-{ML} Shared Task: Multilingual Models, Simple Baselines and Data Augmentation",
author = {S{\"a}lev{\"a}, Jonne and
Palen-Michel, Chester},
editor = {Scherrer, Yves and
Jauhiainen, Tommi and
Ljube{\v{s}}i{\'c}, Nikola and
Zampieri, Marcos and
Nakov, Preslav and
Tiedemann, J{\"o}rg},
booktitle = "Proceedings of the Eleventh Workshop on NLP for Similar Languages, Varieties, and Dialects (VarDial 2024)",
month = jun,
year = "2024",
address = "Mexico City, Mexico",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/add-emnlp-2024-awards/2024.vardial-1.22/",
doi = "10.18653/v1/2024.vardial-1.22",
pages = "241--251",
abstract = "This paper describes the Brandeis University submission to VarDial 2024 DSL-ML Shared Task on multilabel classification for discriminating between similar languages. Our submission consists of three entries per language to the closed track, where no additional data was permitted. Our approach involves a set of simple non-neural baselines using logistic regression, random forests and support vector machines. We follow this by experimenting with finetuning multilingual BERT, either on a single language or all the languages concatenated together.In addition to benchmarking the model architectures against one another on the development set, we perform extensive hyperparameter tuning, which is afforded by the small size of the training data.Our experiments on the development set suggest that finetuned mBERT systems significantly benefit most languages compared to the baseline.However, on the test set, our results indicate that simple models based on scikit-learn can perform surprisingly well and even outperform pretrained language models, as we see with BCMS.Our submissions achieve the best performance on all languages as reported by the organizers. Except for Spanish and French, our non-neural baseline also ranks in the top 3 for all other languages."
}
Markdown (Informal)
[Brandeis at VarDial 2024 DSL-ML Shared Task: Multilingual Models, Simple Baselines and Data Augmentation](https://preview.aclanthology.org/add-emnlp-2024-awards/2024.vardial-1.22/) (Sälevä & Palen-Michel, VarDial 2024)
ACL