@inproceedings{zheng-etal-2022-towards-unifying,
title = "Towards Unifying Reference Expression Generation and Comprehension",
author = "Zheng, Duo and
Kong, Tao and
Jing, Ya and
Wang, Jiaan and
Wang, Xiaojie",
editor = "Goldberg, Yoav and
Kozareva, Zornitsa and
Zhang, Yue",
booktitle = "Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing",
month = dec,
year = "2022",
address = "Abu Dhabi, United Arab Emirates",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/fix-sig-urls/2022.emnlp-main.442/",
doi = "10.18653/v1/2022.emnlp-main.442",
pages = "6598--6611",
abstract = "Reference Expression Generation (REG) and Comprehension (REC) are two highly correlated tasks. Modeling REG and REC simultaneously for utilizing the relation between them is a promising way to improve both. However, the problem of distinct inputs, as well as building connections between them in a single model, brings challenges to the design and training of the joint model. To address the problems, we propose a unified model for REG and REC, named UniRef. It unifies these two tasks with the carefully-designed Image-Region-Text Fusion layer (IRTF), which fuses the image, region and text via the image cross-attention and region cross-attention. Additionally, IRTF could generate pseudo input regions for the REC task to enable a uniform way for sharing the identical representation space across the REC and REG. We further propose Vision-conditioned Masked Language Modeling (VMLM) and Text-Conditioned Region Prediction (TRP) to pre-train UniRef model on multi-granular corpora. The VMLM and TRP are directly related to REG and REC, respectively, but could help each other. We conduct extensive experiments on three benchmark datasets, RefCOCO, RefCOCO+ and RefCOCOg. Experimental results show that our model outperforms previous state-of-the-art methods on both REG and REC."
}