% ACL Anthology record 2023.vardial-1.11 (VarDial 2023 workshop paper).
% The url field uses the canonical aclanthology.org address rather than a
% preview-branch link, and case-protected title words are braced as whole words.
@inproceedings{rupnik-etal-2023-benchic,
    title = {{BENCHi{\'c}-lang}: A Benchmark for Discriminating between {Bosnian}, {Croatian}, {Montenegrin} and {Serbian}},
    author = {Rupnik, Peter and
      Kuzman, Taja and
      Ljube{\v{s}}i{\'c}, Nikola},
    editor = {Scherrer, Yves and
      Jauhiainen, Tommi and
      Ljube{\v{s}}i{\'c}, Nikola and
      Nakov, Preslav and
      Tiedemann, J{\"o}rg and
      Zampieri, Marcos},
    booktitle = {Tenth Workshop on NLP for Similar Languages, Varieties and Dialects (VarDial 2023)},
    month = may,
    year = {2023},
    address = {Dubrovnik, Croatia},
    publisher = {Association for Computational Linguistics},
    url = {https://aclanthology.org/2023.vardial-1.11/},
    doi = {10.18653/v1/2023.vardial-1.11},
    pages = {113--120},
    abstract = {Automatic discrimination between Bosnian, Croatian, Montenegrin and Serbian is a hard task due to the mutual intelligibility of these South-Slavic languages. In this paper, we introduce the BENCHi{\'c}-lang benchmark for discriminating between these four languages. The benchmark consists of two datasets from different domains - a Twitter and a news dataset - selected with the aim of fostering cross-dataset evaluation of different modelling approaches. We experiment with the baseline SVM models, based on character n-grams, which perform nicely in-dataset, but do not generalize well in cross-dataset experiments. Thus, we introduce another approach, exploiting only web-crawled data and the weak supervision signal coming from the respective country/language top-level domains. The resulting simple Naive Bayes model, based on less than a thousand word features extracted from web data, outperforms the baseline models in the cross-dataset scenario and achieves good levels of generalization across datasets.}
}
Markdown (Informal)
[BENCHić-lang: A Benchmark for Discriminating between Bosnian, Croatian, Montenegrin and Serbian](https://aclanthology.org/2023.vardial-1.11/) (Rupnik et al., VarDial 2023)
ACL