% VarDial 2024 workshop paper (ACL Anthology ID 2024.vardial-1.17).
% The url field uses the permanent aclanthology.org address rather than a temporary preview-branch build.
@inproceedings{faisal-anastasopoulos-2024-data,
  title     = {Data-Augmentation-Based Dialectal Adaptation for {LLM}s},
  author    = {Faisal, Fahim and
               Anastasopoulos, Antonios},
  editor    = {Scherrer, Yves and
               Jauhiainen, Tommi and
               Ljube{\v{s}}i{\'c}, Nikola and
               Zampieri, Marcos and
               Nakov, Preslav and
               Tiedemann, J{\"o}rg},
  booktitle = {Proceedings of the Eleventh Workshop on {NLP} for Similar Languages, Varieties, and Dialects ({VarDial} 2024)},
  month     = jun,
  year      = {2024},
  address   = {Mexico City, Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.vardial-1.17/},
  doi       = {10.18653/v1/2024.vardial-1.17},
  pages     = {197--208},
  abstract  = {This report presents gmnlp{'}s participation to the Dialect-Copa shared task at VarDial 2024 (Chifu et al., 2024), which focuses on evaluating the commonsense reasoning capabilities of large language models (LLMs) on South Slavic micro-dialects. The task aims to assess how well LLMs can handle non-standard dialectal varieties, as their performance on standard languages is already well-established. We propose an approach that combines the strengths of different types of language models and leverages data augmentation techniques to improve task performance on three South Slavic dialects: Chakavian, Cherkano, and Torlak. We conduct experiments using a language-family-focused encoder-based model (BERTi{\'c}) and a domain-agnostic multilingual model (AYA-101). Our results demonstrate that the proposed data augmentation techniques lead to substantial performance gains across all three test datasets in the open-source model category. This work highlights the practical utility of data augmentation and the potential of LLMs in handling non-standard dialectal varieties, contributing to the broader goal of advancing natural language understanding in low-resource and dialectal settings.},
}
Markdown (Informal)
[Data-Augmentation-Based Dialectal Adaptation for LLMs](https://aclanthology.org/2024.vardial-1.17/) (Faisal & Anastasopoulos, VarDial 2024)
ACL
- Fahim Faisal and Antonios Anastasopoulos. 2024. Data-Augmentation-Based Dialectal Adaptation for LLMs. In Proceedings of the Eleventh Workshop on NLP for Similar Languages, Varieties, and Dialects (VarDial 2024), pages 197–208, Mexico City, Mexico. Association for Computational Linguistics.