@inproceedings{gu-etal-2023-language,
title = "Do language models have coherent mental models of everyday things?",
author = "Gu, Yuling and
Dalvi Mishra, Bhavana and
Clark, Peter",
editor = "Rogers, Anna and
Boyd-Graber, Jordan and
Okazaki, Naoaki",
booktitle = "Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)",
month = jul,
year = "2023",
address = "Toronto, Canada",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2023.acl-long.106/",
doi = "10.18653/v1/2023.acl-long.106",
pages = "1892--1913",
    abstract = "When people think of everyday things like an egg, they typically have a mental image associated with it. This allows them to correctly judge, for example, that {\textquotedblleft}the yolk surrounds the shell{\textquotedblright} is a false statement. Do language models similarly have a coherent picture of such everyday things? To investigate this, we propose a benchmark dataset consisting of 100 everyday things, their parts, and the relationships between these parts, expressed as 11,720 {\textquotedblleft}X relation Y?{\textquotedblright} true/false questions. Using these questions as probes, we observe that state-of-the-art pre-trained language models (LMs) like GPT-3 and Macaw have fragments of knowledge about these everyday things, but do not have fully coherent {\textquotedblleft}parts mental models{\textquotedblright} (54-59{\%} accurate, 19-43{\%} conditional constraint violation). We propose an extension where we add a constraint satisfaction layer on top of the LM's raw predictions to apply commonsense constraints. As well as removing inconsistencies, we find that this also significantly improves accuracy (by 16-20{\%}), suggesting how the incoherence of the LM's pictures of everyday things can be significantly reduced."
}