@inproceedings{feiyu-etal-2024-bridging,
title = "Bridging the Gap between Authentic and Answer-Guided Images for {C}hinese Vision-Language Understanding Enhancement",
    author = "Wang, Feiyu and
      Guo, Wenyu and
      Yu, Dong and
      Kang, Chen and
      Liu, Pengyuan",
editor = "Lin, Hongfei and
Tan, Hongye and
Li, Bin",
booktitle = "Proceedings of the 23rd Chinese National Conference on Computational Linguistics (Volume 3: Evaluations)",
month = jul,
year = "2024",
address = "Taiyuan, China",
publisher = "Chinese Information Processing Society of China",
    url = "https://aclanthology.org/2024.ccl-3.40/",
pages = "353--362",
language = "eng",
    abstract = "The objective of the Chinese Vision-Language Understanding Evaluation (CVLUE) is to comprehensively assess the performance of Chinese vision-language multimodal pre-trained models in multimodal modeling and understanding across four tasks: Image-Text Retrieval, Visual Question Answering, Visual Grounding, and Visual Dialog. To enhance the models' performance across these multimodal tasks, this paper proposes a multimodal information understanding enhancement method based on answer-guided images. First, we propose task-specific methods for answer-guided image generation. Second, the authentic and answer-guided images are fed into the model separately for multimodal fine-tuning. Finally, training objectives are set for each task to minimize the gap between the answer-guided and authentic images, so that the answer-guided images supervise the results produced from the authentic images. The experimental results demonstrate the effectiveness of the proposed method."
}
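For readers skimming the abstract, a minimal sketch of the gap-minimizing objective it describes might look like the following. This is an illustration only, not the paper's actual formulation: the model interface, the KL-based gap loss, and the weighting factor `alpha` are all assumptions introduced here for clarity.

```python
# Hypothetical sketch of the gap-minimizing training step described in the
# abstract. The model signature, loss choices, and alpha are assumptions.
import torch
import torch.nn.functional as F

def consistency_step(model, authentic_img, guided_img, text, target, alpha=0.5):
    """One training step: standard task loss on authentic images, plus a loss
    that pulls authentic-image outputs toward answer-guided-image outputs."""
    logits_auth = model(authentic_img, text)        # authentic image branch
    with torch.no_grad():
        logits_guided = model(guided_img, text)     # answer-guided branch, used as supervision

    task_loss = F.cross_entropy(logits_auth, target)
    # KL divergence between the two output distributions stands in for
    # "minimizing the gap" between answer-guided and authentic images.
    gap_loss = F.kl_div(F.log_softmax(logits_auth, dim=-1),
                        F.softmax(logits_guided, dim=-1),
                        reduction="batchmean")
    return task_loss + alpha * gap_loss
```

In this reading, the answer-guided branch is held fixed (no gradient) so that it acts as a supervision signal for the authentic branch; the paper sets different training objectives per task, which this single-loss sketch does not reproduce.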