% Brown and Marivate (2025), "Pula: Training Large Language Models for Setswana",
% NAACL 2025 long paper. Proper nouns in the title are brace-protected so that
% sentence-casing bibliography styles keep their capitals. The url field points
% at the canonical ACL Anthology record instead of a preview-branch build.
@inproceedings{brown-marivate-2025-pula,
  title     = {{Pula}: Training Large Language Models for {Setswana}},
  author    = {Brown, Nathan and
               Marivate, Vukosi},
  editor    = {Chiruzzo, Luis and
               Ritter, Alan and
               Wang, Lu},
  booktitle = {Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers)},
  month     = apr,
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.naacl-long.338/},
  pages     = {6634--6656},
  isbn      = {979-8-89176-189-6},
  abstract  = {In this work we present Pula, a suite of bilingual language models proficient in both Setswana and English. Leveraging recent advancements in data availability and efficient fine-tuning, Pula 8B and Pula 14B outperform GPT-4o and Gemini 1.5 Pro on English-Setswana translation tasks and achieve state-of-the-art performance on Setswana reasoning tasks for their size. We release the weights for Pula 1B, 3B, 8B, and 14B as well as training logs and training and evaluation code. Alongside Pula, we release the largest-ever Setswana text corpus, Marothodi, and the first comprehensive Setswana instruction-tuning dataset, Medupi, consisting of reformatted datasets, translated corpora, and synthetic LLM-generated text. To accompany this data, we release the code used for dataset construction, formatting, filtering, and scraping. Last, we release two Setswana LLM-translated benchmarks, MMLU-tsn and GSM8K-tsn, to measure Setswana knowledge and reasoning capabilities.},
}
Markdown (Informal)
[Pula: Training Large Language Models for Setswana](https://aclanthology.org/2025.naacl-long.338/) (Brown & Marivate, NAACL 2025)
ACL
- Nathan Brown and Vukosi Marivate. 2025. Pula: Training Large Language Models for Setswana. In Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 1: Long Papers), pages 6634–6656, Albuquerque, New Mexico. Association for Computational Linguistics.