@inproceedings{marinova-etal-2023-transformer,
title = "Transformer-Based Language Models for {B}ulgarian",
author = "Marinova, Iva and
Simov, Kiril and
Osenova, Petya",
editor = "Mitkov, Ruslan and
Angelova, Galia",
booktitle = "Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing",
month = sep,
year = "2023",
address = "Varna, Bulgaria",
publisher = "INCOMA Ltd., Shoumen, Bulgaria",
url = "https://preview.aclanthology.org/fix-sig-urls/2023.ranlp-1.77/",
pages = "712--720",
abstract = "This paper presents an approach for training lightweight and robust language models for Bulgarian that mitigate gender, political, racial, and other biases in the data. Our method involves scraping content from major Bulgarian online media providers using a specialized procedure for source filtering, topic selection, and lexicon-based removal of inappropriate language during the pre-training phase. We continuously improve the models by incorporating new data from various domains, including social media, books, scientific literature, and linguistically modified corpora. Our motivation is to provide a solution that is sufficient for all natural language processing tasks in Bulgarian, and to address the lack of existing procedures for guaranteeing the robustness of such models."
}