@comment{
  Lamar and Kaya, LoResMT 2023 workshop paper introducing the
  Data Augmentation Advantage (DAA) metric.
  The url field uses the permanent aclanthology.org address for this
  Anthology ID instead of a temporary preview-branch mirror.
  The address field holds the workshop venue, as exported by the Anthology.
}
@inproceedings{lamar-kaya-2023-measuring,
  title     = {Measuring the Impact of Data Augmentation Methods for Extremely Low-Resource {NMT}},
  author    = {Lamar, Annie and
               Kaya, Zeyneb},
  editor    = {Ojha, Atul Kr. and
               Liu, Chao-hong and
               Vylomova, Ekaterina and
               Pirinen, Flammie and
               Abbott, Jade and
               Washington, Jonathan and
               Oco, Nathaniel and
               Malykh, Valentin and
               Logacheva, Varvara and
               Zhao, Xiaobing},
  booktitle = {Proceedings of the Sixth Workshop on Technologies for Machine Translation of Low-Resource Languages ({LoResMT} 2023)},
  month     = may,
  year      = {2023},
  address   = {Dubrovnik, Croatia},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.loresmt-1.8/},
  doi       = {10.18653/v1/2023.loresmt-1.8},
  pages     = {101--109},
  abstract  = {Data augmentation (DA) is a popular strategy to boost performance on neural machine translation tasks. The impact of data augmentation in low-resource environments, particularly for diverse and scarce languages, is understudied. In this paper, we introduce a simple yet novel metric to measure the impact of several different data augmentation strategies. This metric, which we call Data Augmentation Advantage (DAA), quantifies how many true data pairs a synthetic data pair is worth in a particular experimental context. We demonstrate the utility of this metric by training models for several linguistically-varied datasets using the data augmentation methods of back-translation, SwitchOut, and sentence concatenation. In lower-resource tasks, DAA is an especially valuable metric for comparing DA performance as it provides a more effective way to quantify gains when BLEU scores are especially small and results across diverse languages are more divergent and difficult to assess.},
}
Markdown (Informal)
[Measuring the Impact of Data Augmentation Methods for Extremely Low-Resource NMT](https://aclanthology.org/2023.loresmt-1.8/) (Lamar & Kaya, LoResMT 2023)
ACL