@comment{ACL Anthology record for the Lara and Raval system paper in the
  AmericasNLP 2026 workshop proceedings.
  NOTE(review): the url field points at a preview.aclanthology.org ingestion
  build, which may not be a permanent address; confirm the canonical
  aclanthology.org URL once the volume is published and update it here.}
@inproceedings{lara-raval-2026-machine,
  title     = {From Machine Translation to Image Captioning: Training Vision-Language Models for {Indigenous} Languages of the {Americas}},
  author    = {Lara, Luis and
               Raval, Param},
  editor    = {Mager, Manuel and
               Ebrahimi, Abteen and
               Bui, Minh Duc and
               Pugh, Robert and
               Oncevay, Arturo and
               Chiruzzo, Luis and
               Coto-Solano, Rolando and
               Rijhwani, Shruti and
               von der Wense, Katharina},
  booktitle = {Proceedings of the Sixth Workshop on {NLP} for {Indigenous} Languages of the {Americas} ({AmericasNLP})},
  month     = jul,
  year      = {2026},
  address   = {San Diego, California, USA},
  publisher = {Association for Computational Linguistics},
  url       = {https://preview.aclanthology.org/ingest-acl-workshops/2026.americasnlp-6.20/},
  pages     = {224--235},
  isbn      = {979-8-89176-415-6},
  abstract  = {We describe our system for the AmericasNLP 2026 Shared Task on Cultural Image Captioning for Indigenous Languages of the Americas. Our post-training pipeline starts from Aya Vision 32B: the vision-language model is first fine-tuned on machine translation data from prior AmericasNLP shared tasks and then further fine-tuned on the cultural Image Captioning data. This approach uses translation as an intermediate training task, while the final system produces captions directly in the requested Indigenous language rather than translating a Spanish caption afterward. Our experiments show that machine translation fine-tuning is an important initialization step. The resulting fine-tuned vision-language model also shows translation capabilities for the languages considered in this work. In addition, our zero-shot GPT-5.5 submission ranks first in the Maya language track under the official human-evaluation stage.},
}
Markdown (Informal)
[From Machine Translation to Image Captioning: Training Vision-Language Models for Indigenous Languages of the Americas](https://preview.aclanthology.org/ingest-acl-workshops/2026.americasnlp-6.20/) (Lara & Raval, AmericasNLP 2026)
ACL