@inproceedings{zuo-etal-2025-inimagetrans,
title = "{I}n{I}mage{T}rans: Multimodal {LLM}-based Text Image Machine Translation",
author = "Zuo, Fei and
Chen, Kehai and
Zhang, Yu and
Xue, Zhengshan and
Zhang, Min",
editor = "Che, Wanxiang and
Nabende, Joyce and
Shutova, Ekaterina and
Pilehvar, Mohammad Taher",
booktitle = "Findings of the Association for Computational Linguistics: ACL 2025",
month = jul,
year = "2025",
address = "Vienna, Austria",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/landing_page/2025.findings-acl.1039/",
pages = "20256--20277",
ISBN = "979-8-89176-256-5",
abstract = "Multimodal large language models (MLLMs) have shown remarkable capabilities across various downstream tasks. However, when MLLMs are transferred to the text image machine translation (TiMT) task, preliminary experiments reveal that MLLMs suffer from serious repetition and omission hallucinations. To alleviate these issues, this paper first designs an efficient MLLM named InImageTrans for TiMT and then proposes a simple and effective method named multi-conditional direct preference optimization (mcDPO) for advancing the TiMT. Particularly, the proposed mcDPO not only guides the MLLM in rejecting repetition output by creating text output preference pairs automatically, but also guides the MLLM in paying more attention to text information in images by creating image input preference pairs. Furthermore, we build a high-quality benchmark called MCiT for comprehensively evaluating the TiMT capabilities of InImageTrans. Experimental results show that the proposed method significantly outperforms existing open-source MLLMs on MCiT."
}
Markdown (Informal)
[InImageTrans: Multimodal LLM-based Text Image Machine Translation](https://preview.aclanthology.org/landing_page/2025.findings-acl.1039/) (Zuo et al., Findings 2025)
ACL
Fei Zuo, Kehai Chen, Yu Zhang, Zhengshan Xue, and Min Zhang. 2025. InImageTrans: Multimodal LLM-based Text Image Machine Translation. In Findings of the Association for Computational Linguistics: ACL 2025, pages 20256–20277, Vienna, Austria. Association for Computational Linguistics.