@comment{
  RoDia dataset paper, Findings of NAACL 2024, pages 279--286.
  Review notes for maintainers:
  - The citation key is kept exactly as exported so existing \cite commands
    keep resolving, even though it was derived from the first author's
    given name.
  - First author is stored as surname Rotaru, given name Codruț, matching the
    "Last, First" form used for the co-authors (the repository handle in the
    abstract is "codrut2"). NOTE(review): confirm against the paper PDF.
  - url is the canonical ACL Anthology page, not the temporary
    "preview" staging build the record was exported from.
  - Title braces protect whole words rather than single letters.
}
@inproceedings{codrut-etal-2024-rodia,
  author    = {Rotaru, Codruț and
               Ristea, Nicolae and
               Ionescu, Radu},
  title     = {{RoDia}: A New Dataset for {Romanian} Dialect Identification from Speech},
  editor    = {Duh, Kevin and
               Gomez, Helena and
               Bethard, Steven},
  booktitle = {Findings of the Association for Computational Linguistics: NAACL 2024},
  month     = jun,
  year      = {2024},
  address   = {Mexico City, Mexico},
  publisher = {Association for Computational Linguistics},
  pages     = {279--286},
  doi       = {10.18653/v1/2024.findings-naacl.20},
  url       = {https://aclanthology.org/2024.findings-naacl.20/},
  abstract  = {We introduce RoDia, the first dataset for Romanian dialect identification from speech. The RoDia dataset includes a varied compilation of speech samples from five distinct regions of Romania, covering both urban and rural environments, totaling 2 hours of manually annotated speech data. Along with our dataset, we introduce a set of competitive models to be used as baselines for future research. The top scoring model achieves a macro F1 score of 59.83{\%} and a micro F1 score of 62.08{\%}, indicating that the task is challenging. We thus believe that RoDia is a valuable resource that will stimulate research aiming to address the challenges of Romanian dialect identification. We release our dataset at https://github.com/codrut2/RoDia.},
}
Markdown (Informal)
[RoDia: A New Dataset for Romanian Dialect Identification from Speech](https://aclanthology.org/2024.findings-naacl.20/) (Rotaru et al., Findings 2024)
ACL