@inproceedings{jansen-2020-visually,
title = "Visually-Grounded Planning without Vision: Language Models Infer Detailed Plans from High-level Instructions",
author = "Jansen, Peter",
editor = "Cohn, Trevor and
He, Yulan and
Liu, Yang",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2020",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/2020.findings-emnlp.395/",
doi = "10.18653/v1/2020.findings-emnlp.395",
pages = "4412--4417",
abstract = "The recently proposed ALFRED challenge task aims for a virtual robotic agent to complete complex multi-step everyday tasks in a virtual home environment from high-level natural language directives, such as {\textquotedblleft}put a hot piece of bread on a plate{\textquotedblright}. Currently, the best-performing models are able to complete less than 1{\%} of these tasks successfully. In this work we focus on modeling the translation problem of converting natural language directives into detailed multi-step sequences of actions that accomplish those goals in the virtual environment. We empirically demonstrate that it is possible to generate gold multi-step plans from language directives alone without any visual input in 26{\%} of unseen cases. When a small amount of visual information, the starting location in the virtual environment, is incorporated, our best-performing GPT-2 model successfully generates gold command sequences in 58{\%} of cases, suggesting contextualized language models may provide strong planning modules for grounded virtual agents."
}