% Shah, Gupta and Roth: Findings of EMNLP 2020 paper, pages 3547--3553.
% The url field uses the canonical ACL Anthology address derived from the
% paper ID that also appears in the DOI (2020.findings-emnlp.317).
@inproceedings{shah-etal-2020-expect,
    title     = {What do we expect from Multiple-choice {QA} Systems?},
    author    = {Shah, Krunal and Gupta, Nitish and Roth, Dan},
    editor    = {Cohn, Trevor and He, Yulan and Liu, Yang},
    booktitle = {Findings of the Association for Computational Linguistics: EMNLP 2020},
    month     = nov,
    year      = {2020},
    address   = {Online},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2020.findings-emnlp.317/},
    doi       = {10.18653/v1/2020.findings-emnlp.317},
    pages     = {3547--3553},
    abstract  = {The recent success of machine learning systems on various QA datasets could be interpreted as a significant improvement in models' language understanding abilities. However, using various perturbations, multiple recent works have shown that good performance on a dataset might not indicate performance that correlates well with human{'}s expectations from models that ``understand'' language. In this work we consider a top performing model on several Multiple Choice Question Answering (MCQA) datasets, and evaluate it against a set of expectations one might have from such a model, using a series of zero-information perturbations of the model{'}s inputs. Our results show that the model clearly falls short of our expectations, and motivates a modified training approach that forces the model to better attend to the inputs. We show that the new training paradigm leads to a model that performs on par with the original model while better satisfying our expectations.},
}
Markdown (Informal)
[What do we expect from Multiple-choice QA Systems?](https://aclanthology.org/2020.findings-emnlp.317/) (Shah et al., Findings 2020)
ACL