% ACL Anthology record 2020.acl-main.644.
% The title fragments that contain math are wrapped in braces so that
% bibliography styles which re-case titles leave them untouched.
@inproceedings{cirik-etal-2020-refer360,
    title     = {{Refer360$^{\circ}$}: A Referring Expression Recognition Dataset in {360$^{\circ}$} Images},
    author    = {Cirik, Volkan and
                 Berg-Kirkpatrick, Taylor and
                 Morency, Louis-Philippe},
    booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
    month     = jul,
    year      = {2020},
    address   = {Online},
    publisher = {Association for Computational Linguistics},
    url       = {https://aclanthology.org/2020.acl-main.644},
    doi       = {10.18653/v1/2020.acl-main.644},
    pages     = {7189--7202},
    abstract  = {We propose a novel large-scale referring expression recognition dataset, Refer360{\mbox{$^\circ$}}, consisting of 17,137 instruction sequences and ground-truth actions for completing these instructions in 360{\mbox{$^\circ$}} scenes. Refer360{\mbox{$^\circ$}} differs from existing related datasets in three ways. First, we propose a more realistic scenario where instructors and the followers have partial, yet dynamic, views of the scene {--} followers continuously modify their field-of-view (FoV) while interpreting instructions that specify a final target location. Second, instructions to find the target location consist of multiple steps for followers who will start at random FoVs. As a result, intermediate instructions are strongly grounded in object references, and followers must identify intermediate FoVs to find the final target location correctly. Third, the target locations are neither restricted to predefined objects nor chosen by annotators; instead, they are distributed randomly across scenes. This {``}point anywhere{''} approach leads to more linguistically complex instructions, as shown in our analyses. Our examination of the dataset shows that Refer360{\mbox{$^\circ$}} manifests linguistically rich phenomena in a language grounding task that poses novel challenges for computational modeling of language, vision, and navigation.},
}
<?xml version="1.0" encoding="UTF-8"?>
<!-- MODS record for citekey cirik-etal-2020-refer360.
     Degree signs in the title and abstract are literal U+00B0 characters;
     this record is plain UTF-8 text, so no LaTeX math markup is used. -->
<modsCollection xmlns="http://www.loc.gov/mods/v3">
  <mods ID="cirik-etal-2020-refer360">
    <titleInfo>
      <title>Refer360°: A Referring Expression Recognition Dataset in 360° Images</title>
    </titleInfo>
    <!-- Authors, in publication order. -->
    <name type="personal">
      <namePart type="given">Volkan</namePart>
      <namePart type="family">Cirik</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Taylor</namePart>
      <namePart type="family">Berg-Kirkpatrick</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <name type="personal">
      <namePart type="given">Louis-Philippe</namePart>
      <namePart type="family">Morency</namePart>
      <role>
        <roleTerm authority="marcrelator" type="text">author</roleTerm>
      </role>
    </name>
    <originInfo>
      <dateIssued>2020-jul</dateIssued>
    </originInfo>
    <typeOfResource>text</typeOfResource>
    <!-- Host item: the proceedings volume this paper appears in. -->
    <relatedItem type="host">
      <titleInfo>
        <title>Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics</title>
      </titleInfo>
      <originInfo>
        <publisher>Association for Computational Linguistics</publisher>
        <place>
          <placeTerm type="text">Online</placeTerm>
        </place>
      </originInfo>
      <genre authority="marcgt">conference publication</genre>
    </relatedItem>
    <abstract>We propose a novel large-scale referring expression recognition dataset, Refer360°, consisting of 17,137 instruction sequences and ground-truth actions for completing these instructions in 360° scenes. Refer360° differs from existing related datasets in three ways. First, we propose a more realistic scenario where instructors and the followers have partial, yet dynamic, views of the scene – followers continuously modify their field-of-view (FoV) while interpreting instructions that specify a final target location. Second, instructions to find the target location consist of multiple steps for followers who will start at random FoVs. As a result, intermediate instructions are strongly grounded in object references, and followers must identify intermediate FoVs to find the final target location correctly. Third, the target locations are neither restricted to predefined objects nor chosen by annotators; instead, they are distributed randomly across scenes. This “point anywhere” approach leads to more linguistically complex instructions, as shown in our analyses. Our examination of the dataset shows that Refer360° manifests linguistically rich phenomena in a language grounding task that poses novel challenges for computational modeling of language, vision, and navigation.</abstract>
    <identifier type="citekey">cirik-etal-2020-refer360</identifier>
    <identifier type="doi">10.18653/v1/2020.acl-main.644</identifier>
    <location>
      <url>https://aclanthology.org/2020.acl-main.644</url>
    </location>
    <part>
      <date>2020-jul</date>
      <extent unit="page">
        <start>7189</start>
        <end>7202</end>
      </extent>
    </part>
  </mods>
</modsCollection>
%0 Conference Proceedings
%T Refer360°: A Referring Expression Recognition Dataset in 360° Images
%A Cirik, Volkan
%A Berg-Kirkpatrick, Taylor
%A Morency, Louis-Philippe
%S Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics
%D 2020
%8 jul
%I Association for Computational Linguistics
%C Online
%F cirik-etal-2020-refer360
%X We propose a novel large-scale referring expression recognition dataset, Refer360°, consisting of 17,137 instruction sequences and ground-truth actions for completing these instructions in 360° scenes. Refer360° differs from existing related datasets in three ways. First, we propose a more realistic scenario where instructors and the followers have partial, yet dynamic, views of the scene – followers continuously modify their field-of-view (FoV) while interpreting instructions that specify a final target location. Second, instructions to find the target location consist of multiple steps for followers who will start at random FoVs. As a result, intermediate instructions are strongly grounded in object references, and followers must identify intermediate FoVs to find the final target location correctly. Third, the target locations are neither restricted to predefined objects nor chosen by annotators; instead, they are distributed randomly across scenes. This “point anywhere” approach leads to more linguistically complex instructions, as shown in our analyses. Our examination of the dataset shows that Refer360° manifests linguistically rich phenomena in a language grounding task that poses novel challenges for computational modeling of language, vision, and navigation.
%R 10.18653/v1/2020.acl-main.644
%U https://aclanthology.org/2020.acl-main.644
%U https://doi.org/10.18653/v1/2020.acl-main.644
%P 7189-7202
Markdown (Informal)
[Refer360°: A Referring Expression Recognition Dataset in 360° Images](https://aclanthology.org/2020.acl-main.644) (Cirik et al., ACL 2020)
ACL