@inproceedings{mudryi-laba-2025-benchmark,
title = "From Benchmark to Better Embeddings: Leveraging Synonym Substitution to Enhance Multimodal Models in {U}krainian",
author = "Mudryi, Volodymyr and
Laba, Yurii",
editor = "Christodoulopoulos, Christos and
Chakraborty, Tanmoy and
Rose, Carolyn and
Peng, Violet",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
month = nov,
year = "2025",
address = "Suzhou, China",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.findings-emnlp.1115/",
doi = "10.18653/v1/2025.findings-emnlp.1115",
pages = "20458--20468",
ISBN = "979-8-89176-335-7",
abstract = "We study the robustness of text{--}image retrieval for Ukrainian under synonym-substitution attacks (SSA). On Multi30K with OpenCLIP, we evaluate two SSA methods: dictionary-based and LLM-based, and find Ukrainian degrades far more than English (e.g., GPT-4o SSA drops HIT@1 from 32.1 $\to$ 10.9 vs. 41.6 $\to$ 30.4). We introduce a Hybrid method that filters dictionary candidates with an LLM to preserve sense and grammar, yielding higher-quality perturbations (Ukrainian HIT@1 16.8 vs. 7.6/10.9). To mitigate this problem, we propose synonym-augmented fine-tuning, injecting one-word substitutions into training; it boosts robustness (Hybrid 28.1, GPT-4o 25.1) without harming original performance. This is the first systematic SSA evaluation for Ukrainian multimodal retrieval and a practical recipe for improving models in low-resource, morphologically rich languages. We release code, prompts, and trained checkpoints at https://github.com/YuriiLaba/UA-B2BE."
}