% Conference paper in the LREC-COLING 2024 proceedings (ACL Anthology ID 2024.lrec-main.557).
% The url field uses the permanent Anthology address, not a temporary preview-build link.
% Title braces protect whole words (English, Persian) from style-driven lowercasing.
@inproceedings{esalati-etal-2024-esposito,
  title     = {Esposito: An {English}-{Persian} Scientific Parallel Corpus for Machine Translation},
  author    = {Esalati, Mersad and
               Dousti, Mohammad Javad and
               Faili, Heshaam},
  editor    = {Calzolari, Nicoletta and
               Kan, Min-Yen and
               Hoste, Veronique and
               Lenci, Alessandro and
               Sakti, Sakriani and
               Xue, Nianwen},
  booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation (LREC-COLING 2024)},
  month     = may,
  year      = {2024},
  address   = {Torino, Italia},
  publisher = {ELRA and ICCL},
  url       = {https://aclanthology.org/2024.lrec-main.557/},
  pages     = {6299--6308},
  abstract  = {Neural machine translation requires large number of parallel sentences along with in-domain parallel data to attain best results. Nevertheless, no scientific parallel corpus for English-Persian language pair is available. In this paper, a parallel corpus called Esposito is introduced, which contains 3.5 million parallel sentences in the scientific domain for English-Persian language pair. In addition, we present a manually validated scientific test set that might serve as a baseline for future studies. We show that a system trained using Esposito along with other publicly available data improves the baseline on average by 7.6 and 8.4 BLEU scores for En-{\ensuremath{>}}Fa and Fa-{\ensuremath{>}}En directions, respectively. Additionally, domain analysis using the 5-gram KenLM model revealed notable distinctions between our parallel corpus and the existing generic parallel corpus. This dataset will be available to the public upon the acceptance of the paper.},
}
Markdown (Informal)
[Esposito: An English-Persian Scientific Parallel Corpus for Machine Translation](https://aclanthology.org/2024.lrec-main.557/) (Esalati et al., LREC-COLING 2024)
ACL