@comment{Conference paper from ACL 2022 (Volume 1: Long Papers). The url field uses the canonical ACL Anthology address instead of a preview build.}
@inproceedings{ma-etal-2022-unitranser,
  title     = {{UniTranSeR}: A Unified {Transformer} Semantic Representation Framework for Multimodal Task-Oriented Dialog System},
  author    = {Ma, Zhiyuan and
               Li, Jianjun and
               Li, Guohui and
               Cheng, Yongjing},
  editor    = {Muresan, Smaranda and
               Nakov, Preslav and
               Villavicencio, Aline},
  booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = may,
  year      = {2022},
  address   = {Dublin, Ireland},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.acl-long.9/},
  doi       = {10.18653/v1/2022.acl-long.9},
  pages     = {103--114},
  abstract  = {As a more natural and intelligent interaction manner, multimodal task-oriented dialog system recently has received great attention and many remarkable progresses have been achieved. Nevertheless, almost all existing studies follow the pipeline to first learn intra-modal features separately and then conduct simple feature concatenation or attention-based feature fusion to generate responses, which hampers them from learning inter-modal interactions and conducting cross-modal feature alignment for generating more intention-aware responses. To address these issues, we propose UniTranSeR, a Unified Transformer Semantic Representation framework with feature alignment and intention reasoning for multimodal dialog systems. Specifically, we first embed the multimodal features into a unified Transformer semantic space to prompt inter-modal interactions, and then devise a feature alignment and intention reasoning (FAIR) layer to perform cross-modal entity alignment and fine-grained key-value reasoning, so as to effectively identify user's intention for generating more accurate responses. Experimental results verify the effectiveness of UniTranSeR, showing that it significantly outperforms state-of-the-art approaches on the representative MMD dataset.},
}
Markdown (Informal)
[UniTranSeR: A Unified Transformer Semantic Representation Framework for Multimodal Task-Oriented Dialog System](https://aclanthology.org/2022.acl-long.9/) (Ma et al., ACL 2022)
ACL