@inproceedings{gokhale-etal-2020-mutant,
title = "{MUTANT}: A Training Paradigm for Out-of-Distribution Generalization in Visual Question Answering",
author = "Gokhale, Tejas and
Banerjee, Pratyay and
Baral, Chitta and
Yang, Yezhou",
editor = "Webber, Bonnie and
Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing (EMNLP)",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://preview.aclanthology.org/jlcl-multiple-ingestion/2020.emnlp-main.63/",
doi = "10.18653/v1/2020.emnlp-main.63",
pages = "878--892",
abstract = "While progress has been made on the visual question answering leaderboards, models often utilize spurious correlations and priors in datasets under the i.i.d. setting. As such, evaluation on out-of-distribution (OOD) test samples has emerged as a proxy for generalization. In this paper, we present \textit{MUTANT}, a training paradigm that exposes the model to perceptually similar, yet semantically distinct \textit{mutations} of the input, to improve OOD generalization, such as the VQA-CP challenge. Under this paradigm, models utilize a consistency-constrained training objective to understand the effect of semantic changes in input (question-image pair) on the output (answer). Unlike existing methods on VQA-CP, \textit{MUTANT} does not rely on the knowledge about the nature of train and test answer distributions. \textit{MUTANT} establishes a new state-of-the-art accuracy on VQA-CP with a 10.57{\%} improvement. Our work opens up avenues for the use of semantic input mutations for OOD generalization in question answering."
}
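The abstract describes a consistency-constrained objective over semantically distinct input mutations. Below is a minimal, hypothetical PyTorch sketch of that general idea, not the paper's exact loss: `model`, `mutate`, and the weight `lam` are illustrative assumptions.

```python
import torch
import torch.nn.functional as F

def mutant_style_step(model, image, question, answer, mutate, optimizer, lam=1.0):
    """One training step on an (image, question, answer) triple and its mutation."""
    # Generate a perceptually similar, semantically distinct mutation of the
    # input; the mutation changes the ground-truth answer as well.
    image_m, question_m, answer_m = mutate(image, question, answer)

    logits = model(image, question)        # answer logits for the original sample
    logits_m = model(image_m, question_m)  # answer logits for the mutated sample

    # Standard VQA cross-entropy on both the original and the mutant.
    task_loss = F.cross_entropy(logits, answer) + F.cross_entropy(logits_m, answer_m)

    # Pairwise consistency: the model should be comparably confident in the
    # correct answer before and after the mutation (a simplification of the
    # paper's consistency-constrained objective).
    conf = F.softmax(logits, dim=-1).gather(1, answer.unsqueeze(1))
    conf_m = F.softmax(logits_m, dim=-1).gather(1, answer_m.unsqueeze(1))
    consistency = (conf - conf_m).abs().mean()

    loss = task_loss + lam * consistency
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()
```

In the paper itself, mutations come from semantic edits to the image or the question; the sketch abstracts that away behind the hypothetical `mutate` helper.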