@inproceedings{deoghare-etal-2025-refer,
title = "Refer to the Reference: Reference-focused Synthetic Automatic Post-Editing Data Generation",
author = "Deoghare, Sourabh and
Kanojia, Diptesh and
Bhattacharyya, Pushpak",
editor = "Rambow, Owen and
Wanner, Leo and
Apidianaki, Marianna and
Al-Khalifa, Hend and
Di Eugenio, Barbara and
Schockaert, Steven",
booktitle = "Proceedings of the 31st International Conference on Computational Linguistics",
month = jan,
year = "2025",
address = "Abu Dhabi, UAE",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2025.coling-main.344/",
pages = "5123--5135",
abstract = "A prevalent approach to synthetic APE data generation uses source (src) sentences in a parallel corpus to obtain translations (mt) through an MT system and treats corresponding reference (ref) sentences as post-edits (pe). While effective, due to independence between `mt' and `pe,' these translations do not adequately reflect errors to be corrected by a human post-editor. Thus, we introduce a novel and simple yet effective reference-focused synthetic APE data generation technique that uses `ref' instead of `src' sentences to obtain corrupted translations (mt{\_}new). The experimental results across English-German, English-Russian, English-Marathi, English-Hindi, and English-Tamil language pairs demonstrate the superior performance of APE systems trained using the newly generated synthetic data compared to those trained using existing synthetic data. Further, APE models trained using a balanced mix of existing and newly generated synthetic data achieve improvements of 0.37, 0.19, 1.01, 2.42, and 2.60 TER points, respectively. We will release the generated synthetic APE data."
}
Markdown (Informal)
[Refer to the Reference: Reference-focused Synthetic Automatic Post-Editing Data Generation](https://preview.aclanthology.org/fix-sig-urls/2025.coling-main.344/) (Deoghare et al., COLING 2025)
ACL