@inproceedings{yu-etal-2025-imagination,
    title = "Imagination and Contemplation: A Balanced Framework for Semantic-Augmented Multimodal Machine Translation",
    author = "Yu, Zhuang and
      Sun, Shiliang and
      Zhao, Jing and
      Song, Tengfei and
      Yang, Hao",
    editor = "Christodoulopoulos, Christos and
      Chakraborty, Tanmoy and
      Rose, Carolyn and
      Peng, Violet",
    booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2025",
    month = nov,
    year = "2025",
    address = "Suzhou, China",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.findings-emnlp.579/",
    doi = "10.18653/v1/2025.findings-emnlp.579",
    pages = "10913--10928",
    isbn = "979-8-89176-335-7",
    abstract = "Multimodal Machine Translation (MMT) enhances textual translation through auxiliary inputs such as images, which is particularly effective in resolving linguistic ambiguities. However, visual information often introduces redundancy or noise, potentially impairing translation quality. To address this challenge, we propose a balanced semantic-augmented framework that integrates ``Imagination'' and ``Contemplation'' in multimodal understanding. Specifically, we first generate synthetic images from the source text and align them with the authentic images via an optimal transport (OT) loss to enhance visual-semantic consistency. A CLIP-based similarity gating mechanism is introduced to adaptively fuse visual features from both authentic and synthetic images during visual representation learning. To strengthen semantic grounding, a neural machine translation (NMT) branch is incorporated as a regularization signal, and a Kullback-Leibler (KL) divergence is applied between MMT and NMT outputs to mitigate modality mismatch. Furthermore, an image-text contrastive (ITC) loss aligns the final translations with image representations, reinforcing multimodal coherence. Experiments on multiple translation datasets with a diverse set of language pairs demonstrate that our framework outperforms existing baselines, particularly in cases with visually ambiguous or weakly correlated content."
}
Markdown (Informal)
[Imagination and Contemplation: A Balanced Framework for Semantic-Augmented Multimodal Machine Translation](https://preview.aclanthology.org/author-page-yu-wang-polytechnic/2025.findings-emnlp.579/) (Yu et al., Findings 2025)
ACL