% Conference paper in the CLRel 2025 proceedings (ACL Anthology ID 2025.clrel-1.4).
% The url field uses the canonical Anthology address rather than a preview-branch build.
@inproceedings{pavlova-2025-multi,
  title     = {Multi-stage Training of Bilingual {Islamic} {LLM} for Neural Passage Retrieval},
  author    = {Pavlova, Vera},
  editor    = {Sawalha, Majdi and
               Yagi, Sane and
               Alshargi, Faisal and
               AlShdaifat, Abdallah T. and
               Elnagar, Ashraf and
               Shawar, Bayan Abu and
               Abbas, Norhan},
  booktitle = {Proceedings of the New Horizons in Computational Linguistics for Religious Texts},
  month     = jan,
  year      = {2025},
  address   = {Abu Dhabi, UAE},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.clrel-1.4/},
  pages     = {42--52},
  abstract  = {This study examines the use of Natural Language Processing (NLP) technology within the Islamic domain, focusing on developing an Islamic neural retrieval model. By leveraging the robust XLM-R base model, the research employs a language reduction technique to create a lightweight bilingual large language model (LLM). Our approach for domain adaptation addresses the unique challenges faced in the Islamic domain, where substantial in-domain corpora exist only in Arabic while limited in other languages, including English. The work utilizes a multi-stage training process for retrieval models, incorporating large retrieval datasets, such as MS MARCO, and smaller, in-domain datasets to improve retrieval performance. Additionally, we have curated an in-domain retrieval dataset in English by employing data augmentation techniques and involving a reliable Islamic source. This approach enhances the domain-specific dataset for retrieval, leading to further performance gains. The findings suggest that combining domain adaptation and a multi-stage training method for the bilingual Islamic neural retrieval model enables it to outperform monolingual models on downstream retrieval tasks.},
}

Markdown (Informal)
[Multi-stage Training of Bilingual Islamic LLM for Neural Passage Retrieval](https://aclanthology.org/2025.clrel-1.4/) (Pavlova, CLRel 2025)
ACL