@inproceedings{iki-aizawa-2020-language,
title = "Language-{C}onditioned {F}eature {P}yramids for {V}isual {S}election {T}asks",
author = "Iki, Taichi and
Aizawa, Akiko",
booktitle = "Findings of the Association for Computational Linguistics: EMNLP 2020",
month = nov,
year = "2020",
address = "Online",
publisher = "Association for Computational Linguistics",
url = "https://aclanthology.org/2020.findings-emnlp.420",
doi = "10.18653/v1/2020.findings-emnlp.420",
pages = "4687--4697",
abstract = "Referring expression comprehension, which is the ability to locate language to an object in an image, plays an important role in creating common ground. Many models that fuse visual and linguistic features have been proposed. However, few models consider the fusion of linguistic features with multiple visual features with different sizes of receptive fields, though the proper size of the receptive field of visual features intuitively varies depending on expressions. In this paper, we introduce a neural network architecture that modulates visual features with varying sizes of receptive field by linguistic features. We evaluate our architecture on tasks related to referring expression comprehension in two visual dialogue games. The results show the advantages and broad applicability of our architecture. Source code is available at https://github.com/Alab-NII/lcfp .",
}
<?xml version="1.0" encoding="UTF-8"?>
<modsCollection xmlns="http://www.loc.gov/mods/v3">
<mods ID="iki-aizawa-2020-language">
<titleInfo>
<title>Language-Conditioned Feature Pyramids for Visual Selection Tasks</title>
</titleInfo>
<name type="personal">
<namePart type="given">Taichi</namePart>
<namePart type="family">Iki</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<name type="personal">
<namePart type="given">Akiko</namePart>
<namePart type="family">Aizawa</namePart>
<role>
<roleTerm authority="marcrelator" type="text">author</roleTerm>
</role>
</name>
<originInfo>
<dateIssued>2020-11</dateIssued>
</originInfo>
<typeOfResource>text</typeOfResource>
<relatedItem type="host">
<titleInfo>
<title>Findings of the Association for Computational Linguistics: EMNLP 2020</title>
</titleInfo>
<originInfo>
<publisher>Association for Computational Linguistics</publisher>
<place>
<placeTerm type="text">Online</placeTerm>
</place>
</originInfo>
<genre authority="marcgt">conference publication</genre>
</relatedItem>
<abstract>Referring expression comprehension, which is the ability to locate language to an object in an image, plays an important role in creating common ground. Many models that fuse visual and linguistic features have been proposed. However, few models consider the fusion of linguistic features with multiple visual features with different sizes of receptive fields, though the proper size of the receptive field of visual features intuitively varies depending on expressions. In this paper, we introduce a neural network architecture that modulates visual features with varying sizes of receptive field by linguistic features. We evaluate our architecture on tasks related to referring expression comprehension in two visual dialogue games. The results show the advantages and broad applicability of our architecture. Source code is available at https://github.com/Alab-NII/lcfp .</abstract>
<identifier type="citekey">iki-aizawa-2020-language</identifier>
<identifier type="doi">10.18653/v1/2020.findings-emnlp.420</identifier>
<location>
<url>https://aclanthology.org/2020.findings-emnlp.420</url>
</location>
<part>
<date>2020-11</date>
<extent unit="page">
<start>4687</start>
<end>4697</end>
</extent>
</part>
</mods>
</modsCollection>
%0 Conference Proceedings
%T Language-Conditioned Feature Pyramids for Visual Selection Tasks
%A Iki, Taichi
%A Aizawa, Akiko
%S Findings of the Association for Computational Linguistics: EMNLP 2020
%D 2020
%8 nov
%I Association for Computational Linguistics
%C Online
%F iki-aizawa-2020-language
%X Referring expression comprehension, which is the ability to locate language to an object in an image, plays an important role in creating common ground. Many models that fuse visual and linguistic features have been proposed. However, few models consider the fusion of linguistic features with multiple visual features with different sizes of receptive fields, though the proper size of the receptive field of visual features intuitively varies depending on expressions. In this paper, we introduce a neural network architecture that modulates visual features with varying sizes of receptive field by linguistic features. We evaluate our architecture on tasks related to referring expression comprehension in two visual dialogue games. The results show the advantages and broad applicability of our architecture. Source code is available at https://github.com/Alab-NII/lcfp .
%R 10.18653/v1/2020.findings-emnlp.420
%U https://aclanthology.org/2020.findings-emnlp.420
%U https://doi.org/10.18653/v1/2020.findings-emnlp.420
%P 4687-4697
Markdown (Informal)
[Language-Conditioned Feature Pyramids for Visual Selection Tasks](https://aclanthology.org/2020.findings-emnlp.420) (Iki & Aizawa, Findings 2020)
ACL