% Okabe et al., ACL 2025 (Volume 2: Short Papers), pages 196--205.
% url is the permanent ACL Anthology landing page for paper 2025.acl-short.17
% (the exported record pointed at a temporary ingestion-preview mirror).
% address is kept exactly as exported; it names the conference city.
@inproceedings{okabe-etal-2025-improving,
  title     = {Improving Parallel Sentence Mining for Low-Resource and Endangered Languages},
  author    = {Okabe, Shu and
               H{\"a}mmerl, Katharina and
               Fraser, Alexander},
  editor    = {Che, Wanxiang and
               Nabende, Joyce and
               Shutova, Ekaterina and
               Pilehvar, Mohammad Taher},
  booktitle = {Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  month     = jul,
  year      = {2025},
  address   = {Vienna, Austria},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.acl-short.17/},
  pages     = {196--205},
  isbn      = {979-8-89176-252-7},
  abstract  = {While parallel sentence mining has been extensively covered for fairly well-resourced languages, pairs involving low-resource languages have received comparatively little attention. To address this gap, we present Belopsem, a benchmark of new datasets for parallel sentence mining on three language pairs where the source side is low-resource and endangered: Occitan-Spanish, Upper Sorbian-German, and Chuvash-Russian. These combinations also reflect varying linguistic similarity within each pair. We compare three language models in an established parallel sentence mining pipeline and apply two types of improvements to one of them, Glot500. We observe better mining quality overall by both applying alignment post-processing with an unsupervised aligner and using a cluster-based isotropy enhancement technique. These findings are crucial for optimising parallel data extraction for low-resource languages in a realistic way.},
}
Markdown (Informal)
[Improving Parallel Sentence Mining for Low-Resource and Endangered Languages](https://aclanthology.org/2025.acl-short.17/) (Okabe et al., ACL 2025)
ACL