@comment{Conference paper from the ACL 2020 main proceedings (Anthology ID 2020.acl-main.729). The url field holds the canonical ACL Anthology landing page; the doi field is the bare DOI without a resolver prefix.}
@inproceedings{li-etal-2020-mapping,
  author    = {Li, Yang and
               He, Jiacong and
               Zhou, Xin and
               Zhang, Yuan and
               Baldridge, Jason},
  editor    = {Jurafsky, Dan and
               Chai, Joyce and
               Schluter, Natalie and
               Tetreault, Joel},
  title     = {Mapping Natural Language Instructions to Mobile {UI} Action Sequences},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
  month     = jul,
  year      = {2020},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  pages     = {8198--8210},
  doi       = {10.18653/v1/2020.acl-main.729},
  url       = {https://aclanthology.org/2020.acl-main.729/},
  abstract  = {We present a new problem: grounding natural language instructions to mobile user interface actions, and create three new datasets for it. For full task evaluation, we create PixelHelp, a corpus that pairs English instructions with actions performed by people on a mobile UI emulator. To scale training, we decouple the language and action data by (a) annotating action phrase spans in How-To instructions and (b) synthesizing grounded descriptions of actions for mobile user interfaces. We use a Transformer to extract action phrase tuples from long-range natural language instructions. A grounding Transformer then contextually represents UI objects using both their content and screen position and connects them to object descriptions. Given a starting screen and instruction, our model achieves 70.59{\%} accuracy on predicting complete ground-truth action sequences in PixelHelp.},
}
Markdown (Informal)
[Mapping Natural Language Instructions to Mobile UI Action Sequences](https://aclanthology.org/2020.acl-main.729/) (Li et al., ACL 2020)
ACL