@inproceedings{kuparinen-etal-2023-dialect,
title = "Dialect-to-Standard Normalization: A Large-Scale Multilingual Evaluation",
author = "Kuparinen, Olli and
Mileti{\'c}, Aleksandra and
Scherrer, Yves",
editor = "Bouamor, Houda and
Pino, Juan and
Bali, Kalika",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2023",
month = dec,
year = "2023",
address = "Singapore",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2023.findings-emnlp.923/",
doi = "10.18653/v1/2023.findings-emnlp.923",
pages = "13814--13828",
abstract = "Text normalization methods have been commonly applied to historical language or user-generated content, but less often to dialectal transcriptions. In this paper, we introduce dialect-to-standard normalization {--} i.e., mapping phonetic transcriptions from different dialects to the orthographic norm of the standard variety {--} as a distinct sentence-level character transduction task and provide a large-scale analysis of dialect-to-standard normalization methods. To this end, we compile a multilingual dataset covering four languages: Finnish, Norwegian, Swiss German and Slovene. For the two biggest corpora, we provide three different data splits corresponding to different use cases for automatic normalization. We evaluate the most successful sequence-to-sequence model architectures proposed for text normalization tasks using different tokenization approaches and context sizes. We find that a character-level Transformer trained on sliding windows of three words works best for Finnish, Swiss German and Slovene, whereas the pre-trained byT5 model using full sentences obtains the best results for Norwegian. Finally, we perform an error analysis to evaluate the effect of different data splits on model performance."
}