% Workshop paper (LowResNLP 2025); entry appears to be an ACL Anthology export.
% The url field uses the permanent Anthology address rather than a temporary
% preview-branch address, which is not expected to stay reachable.
% NOTE(review): address holds the event location and publisher also carries a
% city, following the Anthology export convention; adjust if the target style
% expects the publisher's city in address.
@inproceedings{garcia-cerda-etal-2025-building,
    title = {Building a Lightweight Classifier to Distinguish Closely Related Language Varieties with Limited Supervision: The Case of {Catalan} vs {Valencian}},
    author = {Garc{\'i}a-Cerd{\'a}, Ra{\'u}l and
      Mir{\'o} Maestre, Mar{\'i}a and
      Canal, Miquel},
    editor = {Estevanell-Valladares, Ernesto Luis and
      Picazo-Izquierdo, Alicia and
      Ranasinghe, Tharindu and
      Mikaberidze, Besik and
      Ostermann, Simon and
      Gurgurov, Daniil and
      Mueller, Philipp and
      Borg, Claudia and
      {\v{S}}imko, Mari{\'a}n},
    booktitle = {Proceedings of the First Workshop on Advancing NLP for Low-Resource Languages},
    month = sep,
    year = {2025},
    address = {Varna, Bulgaria},
    publisher = {INCOMA Ltd., Shoumen, Bulgaria},
    url = {https://aclanthology.org/2025.lowresnlp-1.2/},
    pages = {7--11},
    abstract = {Dialectal variation among closely related languages poses a major challenge in low-resource NLP, as their linguistic similarity increases confusability for automatic systems. We introduce the first supervised classifier to distinguish standard Catalan from its regional variety Valencian. Our lightweight approach fine-tunes a RoBERTa-base model on a manually curated corpus of 20 000 sentences{---}without any Valencian-specific tools{---}and achieves 98 {\%} accuracy on unseen test data. In a human evaluation of 90 mixed-variety items per reviewer, acceptance rates reached 96.7 {\%} for Valencian and 97.7 {\%} for Catalan (97.2 {\%} overall). We discuss limitations with out-of-distribution inputs and outline future work on confidence calibration and dialect-aware tokenization. Our findings demonstrate that high-impact dialect classification is feasible with minimal resources.},
}

Markdown (Informal)
[Building a Lightweight Classifier to Distinguish Closely Related Language Varieties with Limited Supervision: The Case of Catalan vs Valencian](https://preview.aclanthology.org/corrections-2026-01/2025.lowresnlp-1.2/) (García-Cerdá et al., LowResNLP 2025)
ACL