% rupnik-etal-2023-benchic: VarDial 2023 workshop paper (ACL Anthology id 2023.vardial-1.11).
% Whole words are braced in the title so sentence-casing styles keep the proper nouns capitalised.
% The url field uses the canonical Anthology address; the doi is the preferred stable identifier.
% NOTE(review): address appears to hold the workshop location, as in the upstream export;
% confirm whether the target style expects the publisher city instead.
@inproceedings{rupnik-etal-2023-benchic,
  title     = {{BENCHi{\'c}-lang}: A Benchmark for Discriminating between {Bosnian}, {Croatian}, {Montenegrin} and {Serbian}},
  author    = {Rupnik, Peter and
               Kuzman, Taja and
               Ljube{\v{s}}i{\'c}, Nikola},
  editor    = {Scherrer, Yves and
               Jauhiainen, Tommi and
               Ljube{\v{s}}i{\'c}, Nikola and
               Nakov, Preslav and
               Tiedemann, J{\"o}rg and
               Zampieri, Marcos},
  booktitle = {Tenth Workshop on NLP for Similar Languages, Varieties and Dialects (VarDial 2023)},
  month     = may,
  year      = {2023},
  address   = {Dubrovnik, Croatia},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.vardial-1.11/},
  doi       = {10.18653/v1/2023.vardial-1.11},
  pages     = {113--120},
  abstract  = {Automatic discrimination between Bosnian, Croatian, Montenegrin and Serbian is a hard task due to the mutual intelligibility of these South-Slavic languages. In this paper, we introduce the BENCHi{\'c}-lang benchmark for discriminating between these four languages. The benchmark consists of two datasets from different domains - a Twitter and a news dataset - selected with the aim of fostering cross-dataset evaluation of different modelling approaches. We experiment with the baseline SVM models, based on character n-grams, which perform nicely in-dataset, but do not generalize well in cross-dataset experiments. Thus, we introduce another approach, exploiting only web-crawled data and the weak supervision signal coming from the respective country/language top-level domains. The resulting simple Naive Bayes model, based on less than a thousand word features extracted from web data, outperforms the baseline models in the cross-dataset scenario and achieves good levels of generalization across datasets.},
}
Markdown (Informal)
[BENCHić-lang: A Benchmark for Discriminating between Bosnian, Croatian, Montenegrin and Serbian](https://aclanthology.org/2023.vardial-1.11/) (Rupnik et al., VarDial 2023)
ACL