@comment{Conference paper entry (Findings of EACL 2026). NOTE(review): the url field points at a preview.aclanthology.org ingest path; confirm the canonical address before relying on it.}
@inproceedings{nuha-jatowt-2026-towards,
  title     = {Towards the First {NLP} Benchmark for {Ladin} - an Extremely Low-Resource Language},
  author    = {Nuha, Ulin and Jatowt, Adam},
  editor    = {Demberg, Vera and Inui, Kentaro and Marquez, Llu{\'i}s},
  booktitle = {Findings of the {Association} for {Computational} {Linguistics}: {EACL} 2026},
  month     = mar,
  year      = {2026},
  address   = {Rabat, Morocco},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-eacl/2026.findings-eacl.55/},
  pages     = {1049--1064},
  isbn      = {979-8-89176-386-9},
  abstract  = {The performance of large language models (LLMs) tends to degrade for extremely low-resource languages, primarily due to the lack of labeled training data. Despite growing interest, the availability of high-quality natural language processing (NLP) datasets for these languages remains limited. This paper addresses such gap by focusing on Ladin, an endangered Romance language, specifically the Val Badia variant. Leveraging a small set of parallel Ladin{--}Italian sentence pairs, we create synthetic datasets for sentiment analysis and question answering by translating monolingual Italian data. To ensure linguistic quality, we apply rigorous filtering and back-translation procedures in our method. We further demonstrate that incorporating these synthetic datasets into machine translation training leads to substantial improvements over existing Italian{--}Ladin translation baselines. Our contributions include sentiment analysis and question answering datasets for Ladin, establishing foundational resources that support broader NLP research and downstream applications for underrepresented languages.},
}
Markdown (Informal)
[Towards the First NLP Benchmark for Ladin - an Extremely Low-Resource Language](https://preview.aclanthology.org/ingest-eacl/2026.findings-eacl.55/) (Nuha & Jatowt, Findings 2026)
ACL