% Conference paper in the UNLP 2026 proceedings (ACL Anthology ID 2026.unlp-1.1).
% Proper nouns and acronyms are brace-protected as whole words so that
% sentence-casing bibliography styles keep their capitals.
% The url field uses the canonical Anthology address rather than the temporary
% "preview" build link that the scraped export carried.
@inproceedings{shpigunov-2026-improving,
  title     = {Improving Domain-Specific Translation from {English} into {Ukrainian} with Retrieval-Augmented Generation},
  author    = {Shpigunov, Anton},
  editor    = {Romanyshyn, Mariana},
  booktitle = {Proceedings of the Fifth {Ukrainian} Natural Language Processing Conference ({UNLP} 2026)},
  month     = may,
  year      = {2026},
  address   = {Lviv, Ukraine},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2026.unlp-1.1/},
  pages     = {1--11},
  isbn      = {979-8-89176-359-3},
  abstract  = {Large language models have demonstrated competence as language translators, including for lower-resourced languages like Ukrainian. However, in specialized or novel domains, translation quality can suffer without adequate lexical and stylistic reference material. We present a retrieval-augmented approach to English-Ukrainian machine translation in a narrow domain: a private legal/military bilingual corpus. In this approach, semantically similar translation units retrieved via vector embeddings are provided as in-context examples to the LLM. We evaluate three open-weight Gemma 3 models, 4B, 12B, and 27B, against Gemini 3 Flash as a baseline across five augmentation conditions, with k values of 0, 3, 5, 10, and 25, on a 2,581-pair index and a 258-pair test set. We find that context augmentation yields statistically significant improvements in both ChrF++ and COMET for all models, with the smallest model{'}s COMET score improving by 0.076 at k = 3. However, smaller models exhibit context saturation: the 4B model{'}s performance peaks at k = 10 and degrades with additional context, losing 9.72 ChrF++ points and 0.007 COMET between k = 10 and k = 25, while larger models continue to benefit.},
}
Markdown (Informal)
[Improving Domain-Specific Translation from English into Ukrainian with Retrieval-Augmented Generation](https://aclanthology.org/2026.unlp-1.1/) (Shpigunov, UNLP 2026)
ACL