@comment{VarDial 2024 workshop paper; ACL Anthology ID 2024.vardial-1.6.
  Apostrophes are plain ASCII (') so LaTeX typesets a closing quote; proper
  nouns in the title are brace-protected as whole words.}
@inproceedings{hopton-aepli-2024-modeling,
  title     = {Modeling Orthographic Variation in {Occitan}'s Dialects},
  author    = {Hopton, Zachary and
               Aepli, No{\"e}mi},
  editor    = {Scherrer, Yves and
               Jauhiainen, Tommi and
               Ljube{\v{s}}i{\'c}, Nikola and
               Zampieri, Marcos and
               Nakov, Preslav and
               Tiedemann, J{\"o}rg},
  booktitle = {Proceedings of the Eleventh Workshop on NLP for Similar Languages, Varieties, and Dialects (VarDial 2024)},
  month     = jun,
  year      = {2024},
  address   = {Mexico City, Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.vardial-1.6/},
  doi       = {10.18653/v1/2024.vardial-1.6},
  pages     = {78--88},
  abstract  = {Effectively normalizing spellings in textual data poses a considerable challenge, especially for low-resource languages lacking standardized writing systems. In this study, we fine-tuned a multilingual model with data from several Occitan dialects and conducted a series of experiments to assess the model's representations of these dialects. For evaluation purposes, we compiled a parallel lexicon encompassing four Occitan dialects. Intrinsic evaluations of the model's embeddings revealed that surface similarity between the dialects strengthened representations. When the model was further fine-tuned for part-of-speech tagging, its performance was robust to dialectical variation, even when trained solely on part-of-speech data from a single dialect. Our findings suggest that large multilingual models minimize the need for spelling normalization during pre-processing.},
}
Markdown (Informal)
[Modeling Orthographic Variation in Occitan’s Dialects](https://aclanthology.org/2024.vardial-1.6/) (Hopton & Aepli, VarDial 2024)
ACL
- Zachary Hopton and Noëmi Aepli. 2024. Modeling Orthographic Variation in Occitan’s Dialects. In Proceedings of the Eleventh Workshop on NLP for Similar Languages, Varieties, and Dialects (VarDial 2024), pages 78–88, Mexico City, Mexico. Association for Computational Linguistics.