% Okabe and Fraser (ComputEL 2025): bilingual sentence mining for Upper and Lower Sorbian.
% Cleaned from the ACL Anthology export:
%   - removed the placeholder ISBN value "None" (no ISBN is known for this entry);
%   - booktitle ordinal typo "Eight" corrected to "Eighth";
%   - abstract hyphenation artifact "down- stream" rejoined;
%   - language names in the title are brace-protected as whole words so that
%     sentence-casing styles keep "Upper and Lower Sorbian" capitalised;
%   - url switched from the temporary preview/ingest branch to the canonical
%     Anthology address for the same paper ID.
@inproceedings{okabe-fraser-2025-bilingual,
  title     = {Bilingual Sentence Mining for Low-Resource Languages: a Case Study on {Upper} and {Lower} {Sorbian}},
  author    = {Okabe, Shu and
               Fraser, Alexander},
  editor    = {Lachler, Jordan and
               Agyapong, Godfred and
               Arppe, Antti and
               Moeller, Sarah and
               Chaudhary, Aditi and
               Rijhwani, Shruti and
               Rosenblum, Daisy},
  booktitle = {Proceedings of the Eighth Workshop on the Use of Computational Methods in the Study of Endangered Languages},
  month     = mar,
  year      = {2025},
  address   = {Honolulu, Hawaii, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.computel-main.2/},
  pages     = {11--19},
  abstract  = {Parallel sentence mining is crucial for downstream tasks such as Machine Translation, especially for low-resource languages, where such resources are scarce. In this context, we apply a pipeline approach with contextual embeddings on two endangered Slavic languages spoken in Germany, Upper and Lower Sorbian, to evaluate mining quality. To this end, we compare off-the-shelf multilingual language models and word encoders pre-trained on Upper Sorbian to understand their impact on sentence mining. Moreover, to filter out irrelevant pairs, we experiment with a post-processing of mined sentences through an unsupervised word aligner based on word embeddings. We observe the usefulness of additional pre-training in Upper Sorbian, which leads to direct improvements when mining the same language but also its related language, Lower Sorbian.},
}
Markdown (Informal)
[Bilingual Sentence Mining for Low-Resource Languages: a Case Study on Upper and Lower Sorbian](https://aclanthology.org/2025.computel-main.2/) (Okabe & Fraser, ComputEL 2025)
ACL