% Jones and Wijaya, BUCC 2021 workshop paper (ACL Anthology ID 2021.bucc-1.7).
% The url field uses the canonical aclanthology.org address; the earlier value
% pointed at a preview-branch deployment (preview.aclanthology.org), which is
% not the permanent location of the record.
@inproceedings{jones-wijaya-2021-majority,
  title     = {Majority Voting with Bidirectional Pre-translation For Bitext Retrieval},
  author    = {Jones, Alexander and
               Wijaya, Derry Tanti},
  editor    = {Rapp, Reinhard and
               Sharoff, Serge and
               Zweigenbaum, Pierre},
  booktitle = {Proceedings of the 14th Workshop on Building and Using Comparable Corpora (BUCC 2021)},
  month     = sep,
  year      = {2021},
  address   = {Online (Virtual Mode)},
  publisher = {INCOMA Ltd.},
  url       = {https://aclanthology.org/2021.bucc-1.7/},
  pages     = {46--59},
  abstract  = {Obtaining high-quality parallel corpora is of paramount importance for training NMT systems. However, as many language pairs lack adequate gold-standard training data, a popular approach has been to mine so-called {\textquotedblleft}pseudo-parallel{\textquotedblright} sentences from paired documents in two languages. In this paper, we outline some drawbacks with current methods that rely on an embedding similarity threshold, and propose a heuristic method in its place. Our method involves translating both halves of a paired corpus before mining, and then performing a majority vote on sentence pairs mined in three ways: after translating documents in language x to language y, after translating language y to x, and using the original documents in languages x and y. We demonstrate success with this novel approach on the Tatoeba similarity search benchmark in 64 low-resource languages, and on NMT in Kazakh and Gujarati. We also uncover the effect of resource-related factors (i.e. how much monolingual/bilingual data is available for a given language) on the optimal choice of bitext mining method, demonstrating that there is currently no one-size-fits-all approach for this task. We make the code and data used in our experiments publicly available.},
}
Markdown (Informal)
[Majority Voting with Bidirectional Pre-translation For Bitext Retrieval](https://aclanthology.org/2021.bucc-1.7/) (Jones & Wijaya, BUCC 2021)
ACL