% Kuparinen (2023): the Murreviikko dataset paper, VarDial 2023 workshop.
% The url field uses the canonical ACL Anthology address derived from the
% Anthology ID in the DOI (2023.vardial-1.3), not a temporary preview build.
% Braced words in title/booktitle keep their capitalisation under styles
% that apply sentence casing.
@inproceedings{kuparinen-2023-murreviikko,
  author    = {Kuparinen, Olli},
  title     = {{Murreviikko} - A Dialectologically Annotated and Normalized Dataset of {Finnish} Tweets},
  editor    = {Scherrer, Yves and
               Jauhiainen, Tommi and
               Ljube{\v{s}}i{\'c}, Nikola and
               Nakov, Preslav and
               Tiedemann, J{\"o}rg and
               Zampieri, Marcos},
  booktitle = {Tenth Workshop on {NLP} for Similar Languages, Varieties and Dialects ({VarDial} 2023)},
  month     = may,
  year      = {2023},
  address   = {Dubrovnik, Croatia},
  publisher = {Association for Computational Linguistics},
  pages     = {31--39},
  doi       = {10.18653/v1/2023.vardial-1.3},
  url       = {https://aclanthology.org/2023.vardial-1.3/},
  abstract  = {This paper presents Murreviikko, a dataset of dialectal Finnish tweets which have been dialectologically annotated and manually normalized to a standard form. The dataset can be used as a test set for dialect identification and dialect-to-standard normalization, for instance. We evaluate the dataset on the normalization task, comparing an existing normalization model built on a spoken dialect corpus and three newly trained models with different architectures. We find that there are significant differences in normalization difficulty between the dialects, and that a character-level statistical machine translation model performs best on the Murreviikko tweet dataset.},
}
Markdown (Informal)
[Murreviikko - A Dialectologically Annotated and Normalized Dataset of Finnish Tweets](https://aclanthology.org/2023.vardial-1.3/) (Kuparinen, VarDial 2023)
ACL