@inproceedings{misra-etal-2017-mapping,
    title     = {Mapping Instructions and Visual Observations to Actions with Reinforcement Learning},
    author    = {Misra, Dipendra and
                 Langford, John and
                 Artzi, Yoav},
    editor    = {Palmer, Martha and
                 Hwa, Rebecca and
                 Riedel, Sebastian},
    booktitle = {Proceedings of the 2017 Conference on Empirical Methods in Natural Language Processing},
    month     = sep,
    year      = {2017},
    address   = {Copenhagen, Denmark},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/D17-1106/},
    doi       = {10.18653/v1/D17-1106},
    pages     = {1004--1015},
    abstract  = {We propose to directly map raw visual observations and text input to actions for instruction execution. While existing approaches assume access to structured environment representations or use a pipeline of separately trained models, we learn a single model to jointly reason about linguistic and visual input. We use reinforcement learning in a contextual bandit setting to train a neural network agent. To guide the agent{'}s exploration, we use reward shaping with different forms of supervision. Our approach does not require intermediate representations, planning procedures, or training different models. We evaluate in a simulated environment, and show significant improvements over supervised learning and common reinforcement learning variants.}
}
Markdown (Informal)
[Mapping Instructions and Visual Observations to Actions with Reinforcement Learning](https://aclanthology.org/D17-1106/) (Misra et al., EMNLP 2017)
ACL