@inproceedings{isaeva-etal-2025-combining,
title = "Combining Automated and Manual Data for Effective Downstream Fine-Tuning of Transformers for Low-Resource Language Applications",
author = "Isaeva, Ulyana and
Astafurov, Danil and
Martynov, Nikita",
editor = "Fei, Hao and
Tu, Kewei and
Zhang, Yuhui and
Hu, Xiang and
Han, Wenjuan and
Jia, Zixia and
Zheng, Zilong and
Cao, Yixin and
Zhang, Meishan and
Lu, Wei and
Siddharth, N. and
{\O}vrelid, Lilja and
Xue, Nianwen and
Zhang, Yue",
booktitle = "Proceedings of the 1st Joint Workshop on Large Language Models and Structure Modeling (XLLM 2025)",
month = aug,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/landing_page/2025.xllm-1.9/",
pages = "86--90",
ISBN = "979-8-89176-286-2",
abstract = "This paper addresses the constraints of down-stream applications of pre-trained language models (PLMs) for low-resource languages. These constraints are pre-train data deficiency preventing a low-resource language from being well represented in a PLM and inaccessibility of high-quality task-specific data annotation that limits task learning. We propose to use automatically labeled texts combined with manually annotated data in a two-stage task fine-tuning approach. The experiments revealed that utilizing such methodology combined with vocabulary adaptation may compensate for the absence of a targeted PLM or the deficiency of manually annotated data. The methodology is validated on the morphological tagging task for the Udmurt language. We publish our best model that achieved 93.25{\%} token accuracy on HuggingFace Hub along with the training code1."
}
Markdown (Informal)
[Combining Automated and Manual Data for Effective Downstream Fine-Tuning of Transformers for Low-Resource Language Applications](https://aclanthology.org/2025.xllm-1.9/) (Isaeva et al., XLLM 2025)
ACL
Ulyana Isaeva, Danil Astafurov, and Nikita Martynov. 2025. Combining Automated and Manual Data for Effective Downstream Fine-Tuning of Transformers for Low-Resource Language Applications. In Proceedings of the 1st Joint Workshop on Large Language Models and Structure Modeling (XLLM 2025), pages 86–90, Vienna, Austria. Association for Computational Linguistics.