% Conference paper in the IWSLT 2021 proceedings (ACL Anthology record 2021.iwslt-1.33).
% Accented names use BibTeX special-character escapes so they sort and label correctly
% under classic BibTeX; the url field holds the canonical Anthology address.
@inproceedings{nguyen-etal-2021-data,
  title     = {Data Augmentation by Concatenation for Low-Resource Translation: A Mystery and a Solution},
  author    = {Nguyen, Toan Q. and
               Murray, Kenton and
               Chiang, David},
  editor    = {Federico, Marcello and
               Waibel, Alex and
               Costa-juss{\`a}, Marta R. and
               Niehues, Jan and
               St{\"u}ker, Sebastian and
               Salesky, Elizabeth},
  booktitle = {Proceedings of the 18th International Conference on Spoken Language Translation ({IWSLT} 2021)},
  month     = aug,
  year      = {2021},
  address   = {Bangkok, Thailand (online)},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2021.iwslt-1.33/},
  doi       = {10.18653/v1/2021.iwslt-1.33},
  pages     = {287--293},
  abstract  = {In this paper, we investigate the driving factors behind concatenation, a simple but effective data augmentation method for low-resource neural machine translation. Our experiments suggest that discourse context is unlikely the cause for concatenation improving BLEU by about +1 across four language pairs. Instead, we demonstrate that the improvement comes from three other factors unrelated to discourse: context diversity, length diversity, and (to a lesser extent) position shifting.},
}
Markdown (Informal)
[Data Augmentation by Concatenation for Low-Resource Translation: A Mystery and a Solution](https://aclanthology.org/2021.iwslt-1.33/) (Nguyen et al., IWSLT 2021)
ACL