@inproceedings{yu-etal-2017-voila,
    title = "{VOILA}: An Optimised Dialogue System for Interactively Learning Visually-Grounded Word Meanings (Demonstration System)",
    author = "Yu, Yanchao  and
      Eshghi, Arash  and
      Lemon, Oliver",
    editor = "Jokinen, Kristiina  and
      Stede, Manfred  and
      DeVault, David  and
      Louis, Annie",
    booktitle = "Proceedings of the 18th Annual {SIG}dial Meeting on Discourse and Dialogue",
    month = aug,
    year = "2017",
    address = "Saarbr{\"u}cken, Germany",
    publisher = "Association for Computational Linguistics",
    url = "https://preview.aclanthology.org/iwcs-25-ingestion/W17-5524/",
    doi = "10.18653/v1/W17-5524",
    pages = "197--200",
    abstract = "We present VOILA: an optimised, multi-modal dialogue agent for interactive learning of visually grounded word meanings from a human user. VOILA is: (1) able to learn new visual categories interactively from users from scratch; (2) trained on real human-human dialogues in the same domain, and so is able to conduct natural spontaneous dialogue; (3) optimised to find the most effective trade-off between the accuracy of the visual categories it learns and the cost it incurs to users. VOILA is deployed on Furhat, a human-like, multi-modal robot head with back-projection of the face, and a graphical virtual character."
}