% LoResMT 2024 workshop paper presenting the AGE parallel dataset.
% The url field uses the canonical ACL Anthology address (matching the DOI
% suffix) rather than a preview/staging build, which is not a stable link.
% Title words that must keep their capitalisation are brace-protected as
% whole words so sentence-casing styles do not lowercase them.
@inproceedings{ademtew-birbo-2024-age,
  title     = {{AGE}: {Amharic}, {Ge`ez} and {English} Parallel Dataset},
  author    = {Ademtew, Henok Biadglign and
               Birbo, Mikiyas Girma},
  editor    = {Ojha, Atul Kr. and
               Liu, Chao-hong and
               Vylomova, Ekaterina and
               Pirinen, Flammie and
               Abbott, Jade and
               Washington, Jonathan and
               Oco, Nathaniel and
               Malykh, Valentin and
               Logacheva, Varvara and
               Zhao, Xiaobing},
  booktitle = {Proceedings of the Seventh Workshop on Technologies for Machine Translation of Low-Resource Languages ({LoResMT} 2024)},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.loresmt-1.14/},
  doi       = {10.18653/v1/2024.loresmt-1.14},
  pages     = {139--145},
  abstract  = {African languages are not well-represented in Natural Language Processing (NLP). The main reason is a lack of resources for training models. Low-resource languages, such as Amharic and Ge`ez, cannot benefit from modern NLP methods because of the lack of high-quality datasets. This paper presents AGE, an open-source tripartite alignment of Amharic, Ge`ez, and English parallel dataset. Additionally, we introduced a novel, 1,000 Ge`ez-centered sentences sourced from areas such as news and novels. Furthermore, we developed a model from a multilingual pre-trained language model, which brings 12.29 and 30.66 for English-Ge`ez and Ge`ez to English, respectively, and 9.39 and 12.29 for Amharic-Ge`ez and Ge`ez-Amharic respectively.},
}
Markdown (Informal)
[AGE: Amharic, Ge’ez and English Parallel Dataset](https://aclanthology.org/2024.loresmt-1.14/) (Ademtew & Birbo, LoResMT 2024)
ACL
- Henok Biadglign Ademtew and Mikiyas Girma Birbo. 2024. AGE: Amharic, Ge’ez and English Parallel Dataset. In Proceedings of the Seventh Workshop on Technologies for Machine Translation of Low-Resource Languages (LoResMT 2024), pages 139–145, Bangkok, Thailand. Association for Computational Linguistics.