% Workshop paper presenting the QomL'aqtaqa Qom--Spanish parallel corpus and NLLB-200 MT baselines.
% Names use the unambiguous "von Last, First" form; case-sensitive words in titles are braced whole.
% NOTE(review): `url` points at an ACL Anthology preview/ingest branch; presumably it should be
% replaced by the canonical aclanthology.org URL once the volume is published -- verify before release.
@inproceedings{cotik-etal-2026-qomlaqtaqa,
    title     = {{QomL'aqtaqa}: A {Qom}--{Spanish} Parallel Corpus for Natural Language Processing with Machine Translation Evaluation},
    author    = {Cotik, Viviana and
                 Korablev, Aleksei and
                 C{\'u}neo, Paola and
                 Laciana, Pablo},
    editor    = {Mager, Manuel and
                 Ebrahimi, Abteen and
                 Bui, Minh Duc and
                 Pugh, Robert and
                 Oncevay, Arturo and
                 Chiruzzo, Luis and
                 Coto-Solano, Rolando and
                 Rijhwani, Shruti and
                 von der Wense, Katharina},
    booktitle = {Proceedings of the Sixth Workshop on {NLP} for Indigenous Languages of the {Americas} ({AmericasNLP})},
    month     = jul,
    year      = {2026},
    address   = {San Diego, California, USA},
    publisher = {Association for Computational Linguistics},
    url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.americasnlp-6.17/},
    pages     = {186--202},
    isbn      = {979-8-89176-415-6},
    abstract  = {Qom, a language of the Guaycuruan family, is a low-resource language for NLP and speech processing. We present the first parallel Qom{--}Spanish corpus in a computationally usable format, comprising 33,392 parallel segments, totaling 1,469,905 Qom tokens and 891,344 Spanish tokens. A subset of 2,943 segments excludes Bible-derived content. It includes alignments at different levels: sentences, sentence fragments, and paragraphs, and is compiled from multiple sources, both previously available and newly collected. We also present bidirectional neural machine translation baselines based on NLLB-200, achieving competitive performance in both translation directions on the full dataset, and lower performance on the non-Bible subset. An ablation study shows that training exclusively on biblical data reduces performance on non-biblical text, highlighting the importance of domain diversity in low-resource machine translation.},
}